# Lab 7 Assignment: Difference in Means and ANOVA
# The University of Texas at San Antonio
# URP-5393: Urban Planning Methods II
#---- Instructions ----
# 1. [70 points] Open the R file "Lab7_Assignment.R" and answer the questions below
# 2. [30 points] Run a T-test and an ANOVA test on your data.
#---- Part 1. Open the R file "Lab7_Assignment.R" and answer the questions below
# 1.1 load the same household data used in the Lab7_Script.R file, create the object `HTS`
library(data.table)
library(foreign)
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
# 2. Recreate the same steps used in the lab to run a T-test, but instead, consider the following:
# 2.1 Run a T-Test to show if the mean household income is statistically different between households living in single family residences or not (use the whole sample). Produce two pdfs: one with a histogram plot, and another with the simulated hypothesis-testing plot showing where the T-statistic falls. Provide a short interpretation of your results.
HTS[,.N,by=htype]
## htype N
## <fctr> <int>
## 1: single family detached 10712
## 2: multi-family 1410
## 3: single family attached 934
## 4: <NA> 963
## 5: other 193
HTS<-HTS[htype%in%c("single family attached","single family detached")]
# Step 1: Normality
library(ggplot2)
p1<-ggplot(data=HTS,aes(x=hhincome))+
geom_histogram()+
facet_grid(htype~.)
ggsave(filename = "gabrielle-rodriguezbailon_plot_p1.pdf",plot = p1)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 729 rows containing non-finite values (`stat_bin()`).
#Step 2: Variance
ggplot(data=HTS,aes(x=hhincome, y=htype))+
geom_boxplot()
## Warning: Removed 729 rows containing non-finite values (`stat_boxplot()`).
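# The lab checks the variance assumption visually with the boxplot above; as a
# supplement (a sketch, not part of the lab steps), the group variances can also
# be compared numerically:
HTS[!is.na(hhincome), .(var_hhincome = var(hhincome), .N), by = htype]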

# Step 3: dependent variable types
class(HTS$hhincome) ; class(HTS$htype)
## [1] "numeric"
## [1] "factor"
two_tailed_t_test<-HTS[,t.test(formula = hhincome ~ htype)] # two-tailed
two_tailed_t_test
##
## Welch Two Sample t-test
##
## data: hhincome by htype
## t = 21.426, df = 1106, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group single family detached and group single family attached is not equal to 0
## 95 percent confidence interval:
## 23.58036 28.33443
## sample estimates:
## mean in group single family detached mean in group single family attached
## 75.38215 49.42475
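# Optional sanity check (a sketch): the Welch t reported above can be recomputed
# by hand from the group means, variances, and sizes.
grp <- HTS[!is.na(hhincome), .(m = mean(hhincome), v = var(hhincome), n = .N), by = htype]
det <- grp[htype == "single family detached"]
att <- grp[htype == "single family attached"]
(det$m - att$m) / sqrt(det$v / det$n + att$v / att$n)   # should come out near t = 21.426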
one_tailed_t_test<-HTS[,t.test(formula = lnhhincome ~ htype,alternative = 'greater')] # one-tailed
one_tailed_t_test
##
## Welch Two Sample t-test
##
## data: lnhhincome by htype
## t = 17.105, df = 964.05, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group single family detached and group single family attached is not equal to 0
## 95 percent confidence interval:
## 0.4305294 0.5421212
## sample estimates:
## mean in group single family detached mean in group single family attached
## 4.142698 3.656373
curve(dt(x, df = 964.05), from = -25, to = 25)
abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')
upper975 <- qt(p = .975, df = 964.05)
abline(v = upper975,col='red')
lower025 <- qt(p = .025, df = 964.05)
abline(v = lower025,col='red')
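# Question 2.1 also asks for this hypothesis-testing plot as a pdf; a minimal
# sketch wrapping the plotting calls above in a base pdf() device (the filename
# is illustrative):
pdf("gabrielle-rodriguezbailon_plot_p2.pdf", width = 7, height = 5)
curve(dt(x, df = 964.05), from = -25, to = 25)
abline(h = 0, col = 'blue')
points(x = two_tailed_t_test$statistic, y = 0, col = 'red')
abline(v = qt(p = .975, df = 964.05), col = 'red')
abline(v = qt(p = .025, df = 964.05), col = 'red')
dev.off()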

# Both T-tests (one- and two-tailed) are statistically significant (p-value < 2.2e-16, well below 0.05), which implies that H0 can be rejected. Hence we can state that the difference in mean household income between household types exists and is not due to chance.
# 2.2 Filter the sample to select only the region of San Antonio. Prepare a T-Test to show if household vehicle miles traveled (in natural logs - lnvmt) is statistically different between households living in neighborhoods with a job-population index (variable `jobpop`) over and under the city median (of the `jobpop` variable, of course)
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
HTS[,.N,by=region]
## region N
## <fctr> <int>
## 1: Seattle, WA 3908
## 2: Kansas City, MO 3022
## 3: Eugene, OR 1674
## 4: San Antonio, TX 1563
## 5: Detroit, MI 939
## 6: Richmond, VA 612
## 7: Charleston, SC 243
## 8: Winston-Salem, NC 1459
## 9: Syracuse, NY 654
## 10: Madison, WI 138
median(HTS$jobpop)
## [1] NA
HTS[,over_under_jobpop:=as.numeric(jobpop>=0.6457728)]
HTS[,over_under_jobpop:=factor(over_under_jobpop,levels = c(0,1),labels = c("under median","over median"))]
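# Note: median(HTS$jobpop) above returned NA because jobpop has missing values;
# the hard-coded 0.6457728 threshold can presumably be reproduced with na.rm = TRUE.
# A sketch, written to a separate, illustrative column so the steps above are unchanged:
jobpop_median <- median(HTS$jobpop, na.rm = TRUE)
HTS[, over_under_jobpop2 := factor(jobpop >= jobpop_median,
                                   levels = c(FALSE, TRUE),
                                   labels = c("under median", "over median"))]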
HTS<-HTS[over_under_jobpop%in%c("under median", "over median")]
HTS<-HTS[region%in%c("San Antonio, TX")]
# Step 1: Normality
library(ggplot2)
ggplot(data=HTS,aes(x=lnvmt))+
geom_histogram()+
facet_grid(over_under_jobpop~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 43 rows containing non-finite values (`stat_bin()`).

#Step 2: Variance
ggplot(data=HTS,aes(x=lnvmt, y=over_under_jobpop))+
geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).

# Step 3: dependent variable types
class(HTS$lnvmt) ; class(HTS$over_under_jobpop)
## [1] "numeric"
## [1] "factor"
two_tailed_t_test<-HTS[,t.test(formula = lnvmt ~ over_under_jobpop)] # two-tailed
two_tailed_t_test
##
## Welch Two Sample t-test
##
## data: lnvmt by over_under_jobpop
## t = 2.1612, df = 1512.7, p-value = 0.03084
## alternative hypothesis: true difference in means between group under median and group over median is not equal to 0
## 95 percent confidence interval:
## 0.01040175 0.21479104
## sample estimates:
## mean in group under median mean in group over median
## 3.103079 2.990483
one_tailed_t_test<-HTS[,t.test(formula = lnvmt ~ over_under_jobpop,alternative = 'greater')] # one-tailed
one_tailed_t_test
##
## Welch Two Sample t-test
##
## data: lnvmt by over_under_jobpop
## t = 2.1612, df = 1512.7, p-value = 0.03084
## alternative hypothesis: true difference in means between group under median and group over median is not equal to 0
## 95 percent confidence interval:
## 0.01040175 0.21479104
## sample estimates:
## mean in group under median mean in group over median
## 3.103079 2.990483
curve(dt(x, df = 1512.7), from = -25, to = 25)
abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')
upper975 <- qt(p = .975, df = 1512.7)
abline(v = upper975,col='red')
lower025 <- qt(p = .025, df = 1512.7)
abline(v = lower025,col='red')

# 2.3 Using the same data set (San Antonio sample), run an ANOVA test to see if there are significant differences between income categories and vehicle miles traveled by household. Follow the same steps used in the ANOVA exercise done in class. Produce three pdfs: one checking the normality assumption of the dependent variable, a second one checking the presence of outliers, and a third one showing the Tukey (post hoc) T-tests plot.
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
HTS[,.N,by=region]
## region N
## <fctr> <int>
## 1: Seattle, WA 3908
## 2: Kansas City, MO 3022
## 3: Eugene, OR 1674
## 4: San Antonio, TX 1563
## 5: Detroit, MI 939
## 6: Richmond, VA 612
## 7: Charleston, SC 243
## 8: Winston-Salem, NC 1459
## 9: Syracuse, NY 654
## 10: Madison, WI 138
HTS<-HTS[region%in%c("San Antonio, TX")]
ggplot(data=HTS[,.(Mean_lnvmt=mean(lnvmt)),by=income_cat], aes(x=income_cat,y=Mean_lnvmt))+
geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# plots the mean lnvmt by income category (a first look at differences across groups)
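# The warning above indicates some of the plotted group means come out NA because
# mean() was called without na.rm; a sketch computing the group means with na.rm = TRUE:
HTS[!is.na(income_cat), .(Mean_lnvmt = mean(lnvmt, na.rm = TRUE)), by = income_cat]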
hist(HTS$lnvmt)

#shows normality of dependent variable
ggplot(data=HTS, aes(x=income_cat, y=lnvmt))+
geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).

HTS[income_cat=="low (<35K)" & lnvmt<0, .(rhhnum, lnvmt)]
## rhhnum lnvmt
## <num> <num>
## 1: 150000094 -0.55056448
## 2: 150000242 -2.15561555
## 3: 150000400 -0.50098001
## 4: 150000529 -0.65287779
## 5: 150000552 -0.18906997
## 6: 150000667 -0.15533779
## 7: 150000759 -0.51702924
## 8: 150000766 -0.81596098
## 9: 150001173 -0.01811967
## 10: 150004217 -0.17584472
HTS[income_cat=="middle (35K-75K)" & lnvmt<1, .(rhhnum, lnvmt)]
## rhhnum lnvmt
## <num> <num>
## 1: 150000034 -1.617320225
## 2: 150000075 0.110661824
## 3: 150000123 0.547420044
## 4: 150000162 0.971593758
## 5: 150000201 0.962427632
## 6: 150000205 -0.001480761
## 7: 150000359 -0.415248432
## 8: 150000488 -0.057901070
## 9: 150000585 0.934789108
## 10: 150000656 0.630968751
## 11: 150000728 0.803340881
## 12: 150000740 0.786823621
## 13: 150000768 0.087722985
## 14: 150000828 0.820388367
## 15: 150000857 0.995648873
## 16: 150001060 0.673487713
## 17: 150001072 -0.475510803
## 18: 150001122 0.471652396
## 19: 150001374 -1.098307547
## 20: 155000211 0.816963848
## 21: 155000323 -1.613511253
## 22: 155000334 0.788930682
## rhhnum lnvmt
HTS[income_cat=="high (>75K)" & lnvmt<1.5, .(rhhnum, lnvmt)]
## rhhnum lnvmt
## <num> <num>
## 1: 150000113 1.3566209
## 2: 150000164 0.9730796
## 3: 150000315 1.4861585
## 4: 150001040 0.8109847
## 5: 150001169 0.7610517
## 6: 155000199 1.4268300
## 7: 155000223 1.0199762
HTS_bp<-boxplot(HTS$lnvmt~HTS$income_cat)

outliers <- HTS_bp$out
HTS2<-HTS[!lnvmt%in%outliers,]
boxplot(HTS2$lnvmt~HTS2$income_cat)

# boxplot of lnvmt by income category after removing the outliers flagged above
bartlett.test(lnvmt ~ income_cat, data=HTS2)
##
## Bartlett test of homogeneity of variances
##
## data: lnvmt by income_cat
## Bartlett's K-squared = 62.599, df = 2, p-value = 2.552e-14
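# Note: Bartlett rejects homogeneity of variances here (p-value = 2.552e-14 < 0.05).
# A variance-robust alternative (not part of the lab steps) would be Welch's
# one-way test; a sketch for comparison:
oneway.test(lnvmt ~ income_cat, data = HTS2)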
fit<-aov(HTS2$lnvmt~HTS2$income_cat)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## HTS2$income_cat 2 139.1 69.54 93 <2e-16 ***
## Residuals 1482 1108.1 0.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 43 observations deleted due to missingness
TukeyHSD(fit)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = HTS2$lnvmt ~ HTS2$income_cat)
##
## $`HTS2$income_cat`
## diff lwr upr p adj
## middle (35K-75K)-low (<35K) 0.5946113 0.46854317 0.7206794 0.0000000
## high (>75K)-low (<35K) 0.7968258 0.65009509 0.9435565 0.0000000
## high (>75K)-middle (35K-75K) 0.2022145 0.07144679 0.3329822 0.0008622
plot(TukeyHSD(fit))

# shows the Tukey (post hoc) T-tests plot
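# The ANOVA question also asks for three pdfs (normality check, outlier check, and
# the Tukey post-hoc plot); a minimal sketch using the base pdf() device, with
# illustrative filenames:
pdf("anova_normality.pdf")
hist(HTS$lnvmt)                          # normality of the dependent variable
dev.off()
pdf("anova_outliers.pdf")
boxplot(HTS$lnvmt ~ HTS$income_cat)      # presence of outliers
dev.off()
pdf("anova_tukey.pdf")
plot(TukeyHSD(fit))                      # Tukey (post hoc) pairwise comparisons
dev.off()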
# 2. [30 points] Run a T-test and an ANOVA test on your data.
#T-test
Crash_data <- read.csv("//Users/gabbyrodriguez/myrepo/07-lab-gabrielle-rodriguezbailon/my-datasets/my_list.csv")
setDT(Crash_data)
CF <- Crash_data[,.N,by=Contributing.Factors]
Crash_data<-Crash_data[Contributing.Factors%in%c("DRIVER INATTENTION", "FOLLOWED TOO CLOSELY")]
ggplot(data=Crash_data,aes(x=Crash.Not.Injured.Count))+
geom_histogram()+
facet_grid(Contributing.Factors~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=Crash_data,aes(x=Crash.Not.Injured.Count, y=Contributing.Factors))+
geom_boxplot()

class(Crash_data$Crash.Not.Injured.Count) ; class(Crash_data$Contributing.Factors)
## [1] "integer"
## [1] "character"
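# Contributing.Factors is stored as character here; t.test() still runs with a
# two-level character grouping (as shown below), but converting it to a factor
# mirrors the lab's setup (a sketch, optional):
Crash_data[, Contributing.Factors := factor(Contributing.Factors)]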
two_tailed_t_test<-Crash_data[,t.test(formula = Crash.Not.Injured.Count ~ Contributing.Factors)] # two-tailed
two_tailed_t_test
##
## Welch Two Sample t-test
##
## data: Crash.Not.Injured.Count by Contributing.Factors
## t = -1.6984, df = 531.88, p-value = 0.09002
## alternative hypothesis: true difference in means between group DRIVER INATTENTION and group FOLLOWED TOO CLOSELY is not equal to 0
## 95 percent confidence interval:
## -0.72954485 0.05298969
## sample estimates:
## mean in group DRIVER INATTENTION mean in group FOLLOWED TOO CLOSELY
## 1.969014 2.307292
# ANOVA
Crash_data <- read.csv("//Users/gabbyrodriguez/myrepo/07-lab-gabrielle-rodriguezbailon/my-datasets/my_list.csv")
setDT(Crash_data)
Crash_data<-Crash_data[Contributing.Factors%in%c("DRIVER INATTENTION", "FOLLOWED TOO CLOSELY","CHANGED LANE WHEN UNSAFE", "FAILED TO CONTROL SPEED")]
ggplot(data=Crash_data, aes(x=Contributing.Factors, y= Crash.Not.Injured.Count))+
geom_boxplot()

# shows boxplots for each contributing factor
Crash_data[Contributing.Factors=="DRIVER INATTENTION" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
## Crash.ID Crash.Not.Injured.Count
## <int> <int>
## 1: 18699659 6
## 2: 18717692 8
## 3: 18751687 49
## 4: 18858872 6
## 5: 18842955 7
## 6: 18917611 6
## 7: 19227395 7
Crash_data[Contributing.Factors=="FOLLOWED TOO CLOSELY" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
## Crash.ID Crash.Not.Injured.Count
## <int> <int>
## 1: 18742118 6
## 2: 18858449 7
## 3: 18886432 6
## 4: 18913881 6
## 5: 18935375 10
## 6: 18956156 7
## 7: 18961695 10
## 8: 18978575 7
## 9: 19001674 6
## 10: 19058077 8
## 11: 19168862 7
## 12: 19244311 8
Crash_data[Contributing.Factors=="CHANGED LANE WHEN UNSAFE" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
## Crash.ID Crash.Not.Injured.Count
## <int> <int>
## 1: 18772944 6
## 2: 18957010 6
## 3: 19041646 7
Crash_data[Contributing.Factors=="FAILED TO CONTROL SPEED" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
## Crash.ID Crash.Not.Injured.Count
## <int> <int>
## 1: 19022941 7
## 2: 19046364 6
## 3: 19127655 7
# this lists the Crash.IDs of the potential outliers in each group
# deleting outliers
Crash_data_bp<-boxplot(Crash_data$Crash.Not.Injured.Count~Crash_data$Contributing.Factors)

#creates a boxplot from the graphics library and shows the outliers
outliers <- Crash_data_bp$out
#creates an object that stores the outliers
Crash_data[Crash.Not.Injured.Count%in%outliers,]
## Crash.ID Active.School.Zone.Flag At.Intersection.Flag Bridge.Detail
## <int> <char> <lgcl> <char>
## 1: 18695208 NO FALSE NOT APPLICABLE
## 2: 18699659 NO FALSE NOT APPLICABLE
## 3: 18719993 NO FALSE NOT APPLICABLE
## 4: 18708655 NO FALSE NOT APPLICABLE
## 5: 18711692 NO FALSE NOT APPLICABLE
## ---
## 103: 19259401 NO FALSE NOT APPLICABLE
## 104: 19267704 NO FALSE NOT APPLICABLE
## 105: 19276859 NO FALSE NOT APPLICABLE
## 106: 19282593 NO FALSE NOT APPLICABLE
## 107: 19288814 NO FALSE NOT APPLICABLE
## Contributing.Factors Crash.Date
## <char> <char>
## 1: DRIVER INATTENTION 1/11/22
## 2: DRIVER INATTENTION 1/16/22
## 3: DRIVER INATTENTION 1/18/22
## 4: FOLLOWED TOO CLOSELY 1/21/22
## 5: FAILED TO CONTROL SPEED 1/24/22
## ---
## 103: CHANGED LANE WHEN UNSAFE 11/30/22
## 104: CHANGED LANE WHEN UNSAFE 12/3/22
## 105: DRIVER INATTENTION 12/10/22
## 106: FOLLOWED TOO CLOSELY 12/12/22
## 107: FAILED TO CONTROL SPEED 12/12/22
## Crash.Non.Suspected.Serious.Injury.Count Crash.Not.Injured.Count
## <int> <int>
## 1: 0 4
## 2: 0 6
## 3: 0 5
## 4: 0 4
## 5: 0 5
## ---
## 103: 0 4
## 104: 0 4
## 105: 0 4
## 106: 0 4
## 107: 0 4
## Crash.Number Crash.Severity Road.Base.Type Road.Class
## <int> <char> <char> <char>
## 1: 2022012643 C - POSSIBLE INJURY No Data CITY STREET
## 2: 2022016613 N - NOT INJURED No Data US & STATE HIGHWAYS
## 3: 2022035913 N - NOT INJURED No Data US & STATE HIGHWAYS
## 4: 2022024969 N - NOT INJURED No Data INTERSTATE
## 5: 2022027883 N - NOT INJURED No Data INTERSTATE
## ---
## 103: 2022581278 N - NOT INJURED No Data INTERSTATE
## 104: 2022589574 N - NOT INJURED No Data INTERSTATE
## 105: 2022598726 N - NOT INJURED No Data COUNTY ROAD
## 106: 2022604452 N - NOT INJURED No Data US & STATE HIGHWAYS
## 107: 2022610671 N - NOT INJURED No Data US & STATE HIGHWAYS
## Speed.Limit Street.Name
## <int> <char>
## 1: 45 FM2696
## 2: 65 SL1604
## 3: 45 SH0016
## 4: 45 NW LOOP 410
## 5: 45 IH0410
## ---
## 103: 65 IH0035
## 104: 65 IH0410
## 105: 40 TALLEY RD
## 106: 60 SL1604
## 107: 60 SL1604
# gives all of the information for the rows whose Crash.Not.Injured.Count matches an outlier value
Crash_data_2<-Crash_data[!Crash.Not.Injured.Count%in%outliers,]
#deletes those outliers
boxplot(Crash_data_2$Crash.Not.Injured.Count~Crash_data_2$Contributing.Factors)

# Step 1: dependent variable is normal:
hist(Crash_data_2$Crash.Not.Injured.Count) # looks pretty normal to me!

# Step 2: variance homogeneity?
bartlett.test(Crash.Not.Injured.Count ~ Contributing.Factors, data=Crash_data_2) # H0: variances are equal; p-value = 0.0973 > 0.05 ==> fail to reject, variances are homogeneous
##
## Bartlett test of homogeneity of variances
##
## data: Crash.Not.Injured.Count by Contributing.Factors
## Bartlett's K-squared = 6.3138, df = 3, p-value = 0.0973
# Step 3: one-way ANOVA
fit<-aov(Crash_data_2$Crash.Not.Injured.Count~Crash_data_2$Contributing.Factors)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Crash_data_2$Contributing.Factors 3 8.8 2.9213 3.48 0.0157 *
## Residuals 721 605.2 0.8394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the mean not-injured counts differ across contributing factors: the ANOVA p-value (0.0157) is below 0.05, so we reject H0 of equal group means
#post-hoc test
TukeyHSD(fit)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Crash_data_2$Crash.Not.Injured.Count ~ Crash_data_2$Contributing.Factors)
##
## $`Crash_data_2$Contributing.Factors`
## diff lwr
## DRIVER INATTENTION-CHANGED LANE WHEN UNSAFE -0.210837438 -0.44760333
## FAILED TO CONTROL SPEED-CHANGED LANE WHEN UNSAFE -0.259658520 -0.56367891
## FOLLOWED TOO CLOSELY-CHANGED LANE WHEN UNSAFE -0.005193699 -0.27490999
## FAILED TO CONTROL SPEED-DRIVER INATTENTION -0.048821082 -0.31661070
## FOLLOWED TOO CLOSELY-DRIVER INATTENTION 0.205643739 -0.02245658
## FOLLOWED TOO CLOSELY-FAILED TO CONTROL SPEED 0.254464821 -0.04285667
## upr p adj
## DRIVER INATTENTION-CHANGED LANE WHEN UNSAFE 0.02592845 0.1005702
## FAILED TO CONTROL SPEED-CHANGED LANE WHEN UNSAFE 0.04436187 0.1243740
## FOLLOWED TOO CLOSELY-CHANGED LANE WHEN UNSAFE 0.26452259 0.9999562
## FAILED TO CONTROL SPEED-DRIVER INATTENTION 0.21896854 0.9657367
## FOLLOWED TOO CLOSELY-DRIVER INATTENTION 0.43374406 0.0940975
## FOLLOWED TOO CLOSELY-FAILED TO CONTROL SPEED 0.55178632 0.1231212
plot(TukeyHSD(fit))
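# With long contributing-factor names, the default Tukey plot labels can be hard
# to read; a sketch (optional) that widens the left margin and keeps labels horizontal:
op <- par(mar = c(5, 18, 4, 2))   # large left margin for the pairwise comparison labels
plot(TukeyHSD(fit), las = 1)      # las = 1: horizontal axis labels
par(op)                           # restore the previous margins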
