library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readxl)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
fema_data <- read_excel("fema_data.xlsx", sheet = "Core Survey", skip = 1)
## Warning: Expecting logical in LS1120 / R1120C331: got 'there is no third
## gender'
## Warning: Expecting logical in MO1246 / R1246C353: got 'normal'
## Warning: Expecting logical in MO1414 / R1414C353: got 'Fluid'
## Warning: Expecting logical in LS1963 / R1963C331: got 'Transgender'
## Warning: Expecting logical in MO2184 / R2184C353: got 'Genderfluid'
## Warning: Expecting logical in LS2311 / R2311C331: got 'Trans'
## Warning: Expecting logical in MO2699 / R2699C353: got 'Transgender'
## Warning: Expecting logical in MO2830 / R2830C353: got 'Nothing in particular'
## Warning: Expecting logical in MO3048 / R3048C353: got 'Good'
## Warning: Expecting logical in MO3304 / R3304C353: got 'Transgender'
## Warning: Expecting logical in LS4642 / R4642C331: got 'Transgender Female'
## Warning: Expecting logical in MO5355 / R5355C353: got 'Femal'
## Warning: Expecting logical in LS5609 / R5609C331: got 'There are two sexes,
## male and female. I'm a male.'
## Warning: Expecting logical in MO6038 / R6038C353: got 'Transgender'
## Warning: Expecting logical in LS6358 / R6358C331: got 'Transgender Male (FTM)'
## Warning: Expecting logical in MO7225 / R7225C353: got 'solo ager'
#This command instructs R which sheet to use and to start on row 2 for variable names. Row one is the long form question that was asked in the survey.

#This data set contains 7605 observations and 406 variables

#I am interested in learning more about preparedness of individuals with disabilities and FNSS requirements. Using the Codebook provided with the data set I selected the following variables to investigate.
summary(fema_data$disability)
##    Length     Class      Mode 
##      7604 character character
#Unfortunately, these are yes or no questions and will not work easily for this assignment. As such, I will begin with a more basic study of preparedness vs perceived risk.

#level of self perceived preparedness
table(fema_data$dis_soc)
## 
##                                                             Don't know 
##                                                                    387 
##     I am NOT prepared, and I do not intend to prepare in the next year 
##                                                                    736 
## I am NOT prepared, but I intend to get prepared in the next six months 
##                                                                   1505 
##    I am NOT prepared, but I intend to start preparing in the next year 
##                                                                   1252 
##                              I have been prepared for LESS than a year 
##                                                                   1357 
##     I have been prepared for MORE than a year and I continue preparing 
##                                                                   2367
#is there a perceived threat from disaster
table(fema_data$dis_iperception)
## 
##      No Unknown     Yes 
##    1457     619    5528
#individual identifies as having a disability
table(fema_data$disability)
## 
##    Disability No Disability 
##          1855          5749
#Viewing levels of preparedness as factors will allow me to understand how the different responses are ordered from least to most prepared.
#I am also removing "Don't know" responses for preparedness, since those become NA when the ordered factor is created.
fema_data$dis_soc <- factor(fema_data$dis_soc,levels = c(
    "I am NOT prepared, and I do not intend to prepare in the next year",
    "I am NOT prepared, but I intend to start preparing in the next year",
    "I am NOT prepared, but I intend to get prepared in the next six months",
    "I have been prepared for LESS than a year",
    "I have been prepared for MORE than a year and I continue preparing"))

#Check the levels
levels(fema_data$dis_soc)
## [1] "I am NOT prepared, and I do not intend to prepare in the next year"    
## [2] "I am NOT prepared, but I intend to start preparing in the next year"   
## [3] "I am NOT prepared, but I intend to get prepared in the next six months"
## [4] "I have been prepared for LESS than a year"                             
## [5] "I have been prepared for MORE than a year and I continue preparing"
#I am not interested in data from individuals who answered "unknown" to whether they perceive a threat from disasters, have a disability, have experienced a disaster in the past, or whether they provide care for someone with disabilities.
fema_compare <- subset(fema_data,dis_iperception %in% c("Yes", "No") &! is.na(dis_soc) & disability %in% c("Disability", "No Disability") & dis_exp %in% c("Yes", "No") & care %in% c("Yes", "No"))

#Convert disability status to numeric
fema_compare$disability_numeric <- ifelse(fema_compare$disability == "Disability", 1, 0)

#Convert perceived threat to numeric
fema_compare$threat_numeric <- ifelse(fema_compare$dis_iperception == "Yes", 1, 0)

#Convert preparedness to numeric
fema_compare$prep_numeric <- as.numeric(fema_compare$dis_soc)

#turn disaster experience into a factor and remove "Don't know" responses
fema_compare$dis_exp <- factor(fema_compare$dis_exp,levels = c("No", "Yes"))

#turn care required or given into a factor and remove "Don't know" responses
fema_compare$care <- factor(fema_compare$care,levels = c("No", "Yes"))

#Verify that the above step worked
summary(fema_compare$prep_numeric)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.517   5.000   5.000
table(fema_compare$disability, useNA = "ifany")
## 
##    Disability No Disability 
##          1580          4977
#Check the relationship between disability and perceived disaster threat
table(fema_compare$disability, fema_compare$dis_iperception)
##                
##                   No  Yes
##   Disability     259 1321
##   No Disability 1064 3913
chisq.test(table(fema_compare$disability, fema_compare$dis_iperception))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(fema_compare$disability, fema_compare$dis_iperception)
## X-squared = 18.203, df = 1, p-value = 1.986e-05
#Adding the provided weight variable allows respondents to contribute proportionally based on how representative they are of the population.
#Weighted mean preparedness for respondents with disabilities
disability_subset <- subset(fema_compare, disability == "Disability" & !is.na(prep_numeric) & !is.na(weight))
weighted.mean(disability_subset$prep_numeric, disability_subset$weight)
## [1] 3.514663
#Weighted mean preparedness for respondents without disabilities
no_disability_subset <- subset(fema_compare, disability == "No Disability" & !is.na(prep_numeric) & !is.na(weight))
weighted.mean(no_disability_subset$prep_numeric, no_disability_subset$weight)
## [1] 3.523527
# Frequency table to identify the most common disability answer
fema_disability_mode_table<-fema_compare %>% group_by(disability) %>% summarize(count_of_x=n()) %>% arrange(-count_of_x)


head(fema_disability_mode_table)
## # A tibble: 2 × 2
##   disability    count_of_x
##   <chr>              <int>
## 1 No Disability       4977
## 2 Disability          1580
# Frequency table to identify the most common preparedness level
prep_mode_table <- fema_compare %>% group_by(prep_numeric) %>% summarize(count_of_prep = n()) %>% arrange(-count_of_prep)

prep_mode_table
## # A tibble: 5 × 2
##   prep_numeric count_of_prep
##          <dbl>         <int>
## 1            5          2232
## 2            3          1355
## 3            4          1254
## 4            2          1104
## 5            1           612
ggplot(fema_compare, aes(x = disability, y = prep_numeric)) + geom_boxplot() + labs(title = "Preparedness Levels by Disability Status",x = "Disability Status",y = "Preparedness Level (1–5)")

ggplot(fema_compare, aes(x = dis_iperception, y = prep_numeric)) + geom_boxplot() + labs(title = "Preparedness Levels by Perceived Disaster Risk",x = "Perceived Disaster Risk",y = "Preparedness Level (1–5)")

ggplot(fema_compare, aes(x = dis_exp, y = prep_numeric)) + geom_boxplot() + labs(title = "Preparedness Levels by Disaster Experience",x = "Previous Disaster Experience",y = "Preparedness Level (1–5)")

ggplot(fema_compare, aes(x = care, y = prep_numeric)) + geom_boxplot() + labs(title = "Preparedness Levels by Care Responsibilities",x = "Caregiver Status",y = "Preparedness Level (1–5)") + theme_minimal()

# Mean and median preparedness for respondents with disabilities
disability_mean <- mean(fema_compare$prep_numeric[fema_compare$disability == "Disability"], na.rm = TRUE)
disability_median <- median(fema_compare$prep_numeric[fema_compare$disability == "Disability"], na.rm = TRUE)

# Mean and median preparedness for respondents with no disabilities
no_disability_mean <- mean(fema_compare$prep_numeric[fema_compare$disability == "No Disability"], na.rm = TRUE)
no_disability_median <- median(fema_compare$prep_numeric[fema_compare$disability == "No Disability"], na.rm = TRUE)

#Weighted mean for respondents with disabilities
disability_weighted_mean <- weighted.mean(disability_subset$prep_numeric, disability_subset$weight)

#Weighted mean for respondents with no disabilities
no_disability_weighted_mean <- weighted.mean(no_disability_subset$prep_numeric, no_disability_subset$weight)
# Histogram of preparedness with Disability
hist(fema_compare$prep_numeric[fema_compare$disability == "Disability"],
     breaks = 6,
     probability = TRUE)

lines(density(fema_compare$prep_numeric[fema_compare$disability == "Disability"]),
      col = "red",
      lwd = 3)

abline(v = disability_mean, col = "blue", lwd = 5)
abline(v = disability_median, col = "green", lwd = 5)

# Histogram of preparedness with No Disability
hist(fema_compare$prep_numeric[fema_compare$disability == "No Disability"],
     breaks = 6,
     probability = TRUE)

lines(density(fema_compare$prep_numeric[fema_compare$disability == "No Disability"]),
      col = "red",
      lwd = 3)

abline(v = no_disability_mean, col = "blue", lwd = 5)
abline(v = no_disability_median, col = "green", lwd = 5)

#Null hypothesis: There is no difference in disaster preparedness levels between individuals with disabilities and those without disabilities.

#Alternative hypothesis: There is a difference in disaster preparedness levels between individuals with disabilities and those without disabilities.

t.test(prep_numeric ~ disability, data = fema_compare)
## 
##  Welch Two Sample t-test
## 
## data:  prep_numeric by disability
## t = -0.6884, df = 2574.5, p-value = 0.4913
## alternative hypothesis: true difference in means between group Disability and group No Disability is not equal to 0
## 95 percent confidence interval:
##  -0.10547117  0.05065904
## sample estimates:
##    mean in group Disability mean in group No Disability 
##                    3.496203                    3.523609
# Logistic model: Does having a disability effect perceived threat?
glm(threat_numeric ~ disability, data = fema_compare, family = "binomial")
## 
## Call:  glm(formula = threat_numeric ~ disability, family = "binomial", 
##     data = fema_compare)
## 
## Coefficients:
##             (Intercept)  disabilityNo Disability  
##                   1.629                   -0.327  
## 
## Degrees of Freedom: 6556 Total (i.e. Null);  6555 Residual
## Null Deviance:       6594 
## Residual Deviance: 6575  AIC: 6579
# Linear model: Does having a disability + perceiving threat change preparedness?
lm(prep_numeric ~ disability + threat_numeric, data = fema_compare)
## 
## Call:
## lm(formula = prep_numeric ~ disability + threat_numeric, data = fema_compare)
## 
## Coefficients:
##             (Intercept)  disabilityNo Disability           threat_numeric  
##                 2.93756                  0.06072                  0.66817
#What variables really effect preparedness?
disability_exp_perc_care_model<-lm(prep_numeric ~ dis_perception + dis_exp + disability_numeric + care, data = fema_compare)

summary(disability_exp_perc_care_model)
## 
## Call:
## lm(formula = prep_numeric ~ dis_perception + dis_exp + disability_numeric + 
##     care, data = fema_compare)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1143 -0.9830  0.1645  1.1081  2.3389 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.25561    0.03102 104.964  < 2e-16 ***
## dis_perceptionUnlikely    -0.42009    0.04305  -9.758  < 2e-16 ***
## dis_perceptionVery likely  0.22233    0.03784   5.875 4.43e-09 ***
## dis_expYes                 0.50506    0.03436  14.699  < 2e-16 ***
## disability_numeric        -0.17443    0.03949  -4.417 1.01e-05 ***
## careYes                    0.13124    0.04281   3.066  0.00218 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.295 on 6551 degrees of freedom
## Multiple R-squared:  0.08259,    Adjusted R-squared:  0.08189 
## F-statistic:   118 on 5 and 6551 DF,  p-value: < 2.2e-16
#Because testing below indicates that the model violates homoscedasticity and normality, a log model was created in an attempt to mitigate these issues.
disability_exp_perc_care_model_log <- lm(log(prep_numeric) ~ dis_perception + dis_exp + disability_numeric + care,data = fema_compare)
#Is this model legitimate?
#Linearity
plot(disability_exp_perc_care_model,which=1)

raintest(disability_exp_perc_care_model)
## 
##  Rainbow test
## 
## data:  disability_exp_perc_care_model
## Rain = 0.96138, df1 = 3279, df2 = 3272, p-value = 0.8701
#With a p-value of greater than 0.05 the model is likely linear.
#Independence of errors
durbinWatsonTest(disability_exp_perc_care_model)
##  lag Autocorrelation D-W Statistic p-value
##    1    -0.007347213      2.014606   0.562
##  Alternative hypothesis: rho != 0
#Although the model exhibits heteroscedasticity, the Durbin-Watson test indicates no evidence of autocorrelation with a p-value of greater than 0.5. These results suggest that while the variance of residuals is not constant, the independence assumption is still satisfied.
#Homoscedasticity
plot(disability_exp_perc_care_model,which=3)

bptest(disability_exp_perc_care_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  disability_exp_perc_care_model
## BP = 151.68, df = 5, p-value < 2.2e-16
#With a p-value less that 0.05, the model is likely heteroscedastic. 

#Homoscedasticity mitigation
plot(disability_exp_perc_care_model_log,which=3)

bptest(disability_exp_perc_care_model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  disability_exp_perc_care_model_log
## BP = 258.04, df = 5, p-value < 2.2e-16
#The log transformation was found to be ineffective, likely due to the limited range of preparedness levels (1-5). So, instead of transforming the dependent variable, robust standard errors are used to correct for heteroscedasticity. This approach keeps the original model intact while adjusting the standard errors and p-values to be more reliable in the presence of unequal variance across observations. 
coeftest(disability_exp_perc_care_model, vcov = vcovHC(disability_exp_perc_care_model, type = "HC1"))
## 
## t test of coefficients:
## 
##                            Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)                3.255614   0.031627 102.9380 < 2.2e-16 ***
## dis_perceptionUnlikely    -0.420092   0.046877  -8.9616 < 2.2e-16 ***
## dis_perceptionVery likely  0.222325   0.036128   6.1538 8.012e-10 ***
## dis_expYes                 0.505065   0.034840  14.4967 < 2.2e-16 ***
## disability_numeric        -0.174431   0.040110  -4.3488 1.390e-05 ***
## careYes                    0.131242   0.042332   3.1003  0.001942 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#While heteroscedasticity remains present, the adjusted standard errors provide more reliable estimates of statistical significance. All variables 
#Normality of residuals
plot(disability_exp_perc_care_model,which=2)

#Due to >5,000 sample size, the Shapiro-Wilk test will not run. Instead, the K-S test was used. 
ks.test(disability_exp_perc_care_model$residuals,"pnorm")
## Warning in ks.test.default(disability_exp_perc_care_model$residuals, "pnorm"):
## ties should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  disability_exp_perc_care_model$residuals
## D = 0.18735, p-value < 2.2e-16
## alternative hypothesis: two-sided
#Normality of residuals mitigation
plot(disability_exp_perc_care_model_log,which=2)

#After applying the log transformation, the Q-Q plot shows the residuals deviate further from normality. This is likely due to the limited scale of the preparedness variable, which only ranges from 1 to 5. As a result, the transformation does not resolve the issue. Given the large sample size, minor deviations from normality are not considered a major concern.
#3e No multicolinarity
vif(disability_exp_perc_care_model)
##                        GVIF Df GVIF^(1/(2*Df))
## dis_perception     1.116341  2        1.027896
## dis_exp            1.118157  1        1.057429
## disability_numeric 1.114446  1        1.055673
## care               1.145472  1        1.070267
#A VIF less than 10 for all variables likely means that they are not strongly correlated with one another.
#Does disaster perception interact with another variable to have a synergistic effect on preparedness level?
interaction_model <- lm(prep_numeric ~ dis_perception * disability_numeric + dis_exp + care, 
data = fema_compare)

ggplot(fema_compare, aes(x = dis_perception, y = prep_numeric, fill = as.factor(disability_numeric))) + geom_boxplot() + labs(title = "Preparedness by Perceived Threat and Disability Status",x = "Perceived Threat",y = "Preparedness Level",fill = "Disability Status")

summary(interaction_model)
## 
## Call:
## lm(formula = prep_numeric ~ dis_perception * disability_numeric + 
##     dis_exp + care, data = fema_compare)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1418 -1.0045  0.1827  1.1081  2.2669 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                   3.25097    0.03210 101.282
## dis_perceptionUnlikely                       -0.43364    0.04791  -9.051
## dis_perceptionVery likely                     0.24918    0.04416   5.643
## disability_numeric                           -0.15597    0.05561  -2.805
## dis_expYes                                    0.50435    0.03437  14.673
## careYes                                       0.13732    0.04298   3.195
## dis_perceptionUnlikely:disability_numeric     0.07172    0.10508   0.683
## dis_perceptionVery likely:disability_numeric -0.09393    0.08411  -1.117
##                                              Pr(>|t|)    
## (Intercept)                                   < 2e-16 ***
## dis_perceptionUnlikely                        < 2e-16 ***
## dis_perceptionVery likely                    1.74e-08 ***
## disability_numeric                            0.00505 ** 
## dis_expYes                                    < 2e-16 ***
## careYes                                       0.00140 ** 
## dis_perceptionUnlikely:disability_numeric     0.49493    
## dis_perceptionVery likely:disability_numeric  0.26412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.295 on 6549 degrees of freedom
## Multiple R-squared:  0.08294,    Adjusted R-squared:  0.08196 
## F-statistic: 84.62 on 7 and 6549 DF,  p-value: < 2.2e-16
#The interactions were not statistically significant, suggesting that the relationship between perceived risk and preparedness is consistent regardless of disability status. As a result, the simpler additive model was retained for interpretation.