# Lab 7 Assignment: Difference in Means and ANOVA
# The University of Texas at San Antonio
# URP-5393: Urban Planning Methods II


#---- Instructions ----

# 1. [70 points] Open the R file "Lab7_Assignment.R" and answer the questions bellow
# 2. [30 points] Run a T-test and an ANOVA test in your data.


#---- Part 1. Open the R file "Lab7_Assignment.R" and answer the questions bellow

# 1.1 load the same household data used in the Lab7_Script.R file, create the object `HTS`
library(data.table)
library(foreign)
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
# 2. Recreate the same steps used in the lab to run a T-test, but instead, consider the following:
# 2.1 Run a T-Test to show if the household income means is statistically different between households living in single family residences or not (use the whole sample). Produce two pdfs, one with an histogram pdf plot, and another with the simulated hypothesis testing plot showing where the T-statistic falls. Provide a short interpretation of your results
library(ggplot2)
ggplot(data=HTS[!is.na(sf),],aes(x=hhincome))+
  geom_histogram()+
  facet_grid(sf~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 831 rows containing non-finite values (`stat_bin()`).

ggsave("income_histogram.pdf", device = "pdf")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 831 rows containing non-finite values (`stat_bin()`).
# Cleaning the data by removing NA from both sf and hhincome.
HTS<-HTS[!is.na(sf),]
HTS<-HTS[!is.na(hhincome),]
# Checking variables types
class(HTS$sf)  #factor with 2 levels
## [1] "factor"
class(HTS$hhincome)  #numeric
## [1] "numeric"
# 2 tailed test
two_tailed_t_test<-HTS[,t.test(formula = hhincome ~ sf)] 
two_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by sf
## t = -36.234, df = 4353.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group other and group single family detached is not equal to 0
## 95 percent confidence interval:
##  -30.19913 -27.09895
## sample estimates:
##                  mean in group other mean in group single family detached 
##                             46.73311                             75.38215
# 1 tailed test
one_tailed_t_test<-HTS[,t.test(formula = hhincome ~ sf,alternative = 'greater')]
one_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by sf
## t = -36.234, df = 4353.4, p-value = 1
## alternative hypothesis: true difference in means between group other and group single family detached is greater than 0
## 95 percent confidence interval:
##  -29.94983       Inf
## sample estimates:
##                  mean in group other mean in group single family detached 
##                             46.73311                             75.38215
# I noticed that there was a typo in the Lab7_Script, in line 57, "alernative" should be "alternative", otherwise it gives the same results as 2 tailed test. Because R ignores that argument and proceed with defaults
# The P value of 2-tailed test is < 2.2e-16, which is good (against the null hypothesis of equal means between the two groups).
# But 1_tailed test gives p value = 1, and t = -36.234, which means the mean of the first group (other) is less than the mean of the second group (single family detached), which is opposite to the specified alternative hypothesis 'greater'
# 1_tailed test can not reject the null hypothesis(which is the mean household income (hhincome) for households living in single-family detached homes (sf) is less than or equal to the mean household income for households not living in single-family detached homes.)
# Which means we can accept the alternative hypothesis that the mean household income for the single-family detached homes is greater than the mean for other types of homes.

# Simulated hypothesis testing plot
# x=two_tailed_t_test$statistic= -36.23447, there for we put -40 to 40 range.
pdf("hypothesis_testing_plot.pdf") # This opens a pdf file, start mapping.
curve(dt(x, df = 4353.4), from = -40, to = 40)
abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')
upper975 <- qt(p = .975, df = 4353.4)
abline(v = upper975,y=0,col='red')
## Warning in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...): "y" is not
## a graphical parameter
lower025 <- qt(p = .025, df = 4353.4)
abline(v = lower025,y=0,col='red')
## Warning in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...): "y" is not
## a graphical parameter
dev.off() # Save it and close
## png 
##   2
# 2.2 Filter the sample to select only the region of San Antonio. Prepare an T-Test to show if the household vehicle miles traveled (in natural logs - lnvmt) is statistically different between households living in neighborhoods with a job-population index (variable `jobpop`) over and under the city median (of the `jobpop` variable of course)
HTS_SA<-HTS[region=="San Antonio, TX",]
HTS_SA[,above_median_jobpop:=factor(as.numeric(jobpop>quantile(jobpop,0.5)))]
T_test_2tailed<-HTS_SA[,t.test(formula = lnvmt ~ above_median_jobpop )] 
T_test_2tailed
## 
##  Welch Two Sample t-test
## 
## data:  lnvmt by above_median_jobpop
## t = 2.2052, df = 1512.2, p-value = 0.02759
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.01269755 0.21709014
## sample estimates:
## mean in group 0 mean in group 1 
##        3.104152        2.989259
# 2.2 using the same data set (San Antonio sample), run an ANOVA test to see if there are significant differences between income categories and vehicle miles traveled by household. Follow the same steps used in the ANOVA exercise done in class. Produce three pdfs: one checking the normality assumption of the dependent variable, a second one checking the presence of outliers, and a third one showing the Tukey (post hoc) T-tests plot.
#Normality
ggplot(data=HTS_SA, aes(x=income_cat, y= lnvmt))+
  geom_boxplot() 
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).

ggsave("normality_check.pdf", device = "pdf")
## Saving 7 x 5 in image
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).
#Outlier
HTS_SA_bp<-boxplot(HTS_SA$lnvmt~HTS_SA$income_cat)
outliers <- HTS_SA_bp$out
HTS_SA2 <- HTS_SA[!lnvmt%in%outliers,]
boxplot(HTS_SA$lnvmt~HTS_SA$income_cat)

boxplot(HTS_SA2$lnvmt~HTS_SA2$income_cat)

#Dependent variable Normality 
hist(HTS_SA2$lnvmt) # looks normal

#Homogeneity
bartlett.test(lnvmt ~ income_cat, data=HTS_SA2) #Reject H0
## 
##  Bartlett test of homogeneity of variances
## 
## data:  lnvmt by income_cat
## Bartlett's K-squared = 62.599, df = 2, p-value = 2.552e-14
#one-way anova
fit<-aov(HTS_SA2$lnvmt~HTS_SA2$income_cat)
summary(fit)
##                      Df Sum Sq Mean Sq F value Pr(>F)    
## HTS_SA2$income_cat    2  139.1   69.54      93 <2e-16 ***
## Residuals          1482 1108.1    0.75                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 43 observations deleted due to missingness
#post-hoc test
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = HTS_SA2$lnvmt ~ HTS_SA2$income_cat)
## 
## $`HTS_SA2$income_cat`
##                                   diff        lwr       upr     p adj
## middle (35K-75K)-low (<35K)  0.5946113 0.46854317 0.7206794 0.0000000
## high (>75K)-low (<35K)       0.7968258 0.65009509 0.9435565 0.0000000
## high (>75K)-middle (35K-75K) 0.2022145 0.07144679 0.3329822 0.0008622
plot(TukeyHSD(fit))

# 2. [30 points] Run a T-test and an ANOVA test in your data.
#Step1 prepare the data.
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mgcv)
## Warning: package 'mgcv' was built under R version 4.3.2
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
dir.create("my_datasets")
## Warning in dir.create("my_datasets"): 'my_datasets' already exists
file.copy("E:/UTSA/redlining/Climate Disaster/flood_risk/flood_v2.1_summary_fsf_flood_tract_summary.csv", 
          "my_datasets/flood_v2.1_summary_fsf_flood_tract_summary.csv")
## [1] FALSE
file.copy("E:/UTSA/redlining/Climate Disaster/Heat/heat_v1.1_summary_fsf_heat_tract_summary.csv", 
          "my_datasets/heat_v1.1_summary_fsf_heat_tract_summary.csv")
## [1] FALSE
file.copy("E:/UTSA/redlining/holc_tract_lookup.csv", 
          "my_datasets/holc_tract_lookup.csv")
## [1] FALSE
HOLC<- read_csv("my_datasets/holc_tract_lookup.csv")
## Rows: 41171 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): geoid, state, city, name, holc_id
## dbl (5): cartodb_id, polygon_id, tract_prop, holc_prop, neighborhood_id
## lgl (1): the_geom
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Flood<- read_csv("my_datasets/flood_v2.1_summary_fsf_flood_tract_summary.csv")
## Rows: 84784 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): fips
## dbl (11): count_property, count_floodfactor1, count_floodfactor2, count_floo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Heat<- read_csv("my_datasets/heat_v1.1_summary_fsf_heat_tract_summary.csv")
## Rows: 83241 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): fips
## dbl (11): count_property, count_heatfactor1, count_heatfactor2, count_heatfa...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
HOLC$holc_grade <- gsub("[0-9]", "", HOLC$holc_id)
HOLC$holc_grade <- factor(HOLC$holc_grade, levels = c("A", "B", "C", "D"))
Flood <- Flood %>%
  mutate(Floodhigh_risk_sum = rowSums(select(., count_floodfactor5:count_floodfactor10), na.rm = TRUE),
         Floodtotal_properties = count_property,
         Floodpercentage_high_risk = (Floodhigh_risk_sum / Floodtotal_properties) * 100)
Heat  <- Heat %>%
  mutate(Heathigh_risk_sum = rowSums(select(., count_heatfactor5:count_heatfactor10), na.rm = TRUE),
         Heattotal_properties = count_property,
         Heatpercentage_high_risk = (Heathigh_risk_sum / Heattotal_properties) * 100)
combined_data <- left_join(HOLC, Flood, by = c("geoid" = "fips"))
combined_data <- left_join(combined_data, Heat, by = c("geoid" = "fips"))

#temp_data <- combined_data2[!is.na(combined_data2$holc_grade), ]
#summary(temp_data)

#Now we have our data "combined_data", we will use variables named holc_grade, Flood and Heat.

# Cleaning the data by removing NA.
#combined_data2<-combined_data2[!is.na(combined_data$holc_grade),]
#combined_data2<-combined_data2[!is.na(combined_data$Floodpercentage_high_risk),]
#I try Tidyverse instead
combined_data2 <- combined_data %>% 
  filter(!is.na(holc_grade))
combined_data2 <- combined_data %>% 
  filter(!is.na(Floodpercentage_high_risk))
combined_data2 <- combined_data %>% 
  filter(!is.na(Heatpercentage_high_risk))
# Checking variables types (Not suitable for 2 or 1 tailed test since we got 4 levels factor in HOLC)
class(combined_data2$holc_grade)  #factor with 4 levels
## [1] "factor"
class(combined_data2$Floodpercentage_high_risk)  #numeric
## [1] "numeric"
class(combined_data2$Heatpercentage_high_risk)  #numeric
## [1] "numeric"
# Therefore do ANOVA
#Normality(>30, No need)
#Outlier
combined_data2_bp<-boxplot(combined_data2$Floodpercentage_high_risk~combined_data2$holc_grade)
outliers <- combined_data2_bp$out
combined_data3 <- combined_data2[!combined_data2$Floodpercentage_high_risk %in% outliers, ]
boxplot(combined_data2$Floodpercentage_high_risk~combined_data2$holc_grade)

boxplot(combined_data3$Floodpercentage_high_risk~combined_data3$holc_grade)

#Dependent variable Normality(No need)
#Homogeneity
bartlett.test(Floodpercentage_high_risk ~ holc_grade, data=combined_data3) #p-value < 2.2e-16, Reject H0
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Floodpercentage_high_risk by holc_grade
## Bartlett's K-squared = 1032.1, df = 3, p-value < 2.2e-16
#one-way anova
fit_flood<-aov(combined_data3$Floodpercentage_high_risk~combined_data3$holc_grade)
summary(fit_flood)
##                              Df  Sum Sq Mean Sq F value Pr(>F)    
## combined_data3$holc_grade     3   22671    7557   153.8 <2e-16 ***
## Residuals                 31956 1569697      49                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1415 observations deleted due to missingness
#post-hoc test
TukeyHSD(fit_flood)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = combined_data3$Floodpercentage_high_risk ~ combined_data3$holc_grade)
## 
## $`combined_data3$holc_grade`
##          diff        lwr       upr     p adj
## B-A 0.2142019 -0.1847979 0.6132016 0.5124401
## C-A 1.1701430  0.7959953 1.5442908 0.0000000
## D-A 2.4063990  2.0090262 2.8037717 0.0000000
## C-B 0.9559412  0.6984714 1.2134110 0.0000000
## D-B 2.1921971  1.9020088 2.4823854 0.0000000
## D-C 1.2362559  0.9813147 1.4911971 0.0000000
plot(TukeyHSD(fit_flood))

# Bonus: [30 points] Provide an HTML file with your answers using R-Markdown.