# Lab 7 Assignment: Difference in Means and ANOVA
# The University of Texas at San Antonio
# URP-5393: Urban Planning Methods II


#---- Instructions ----

# 1. [70 points] Open the R file "Lab7_Assignment.R" and answer the questions below
# 2. [30 points] Run a T-test and an ANOVA test in your data.


#---- Part 1. Open the R file "Lab7_Assignment.R" and answer the questions below

# 1.1 load the same household data used in the Lab7_Script.R file, create the object `HTS`
library(data.table)
library(foreign)

HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
# 2. Recreate the same steps used in the lab to run a T-test, but instead, consider the following:
# 2.1 Run a T-test to show if mean household income is statistically different between households living in single-family residences and those that are not (use the whole sample). Produce two PDFs: one with a histogram plot and another with the simulated hypothesis-testing plot showing where the T-statistic falls. Provide a short interpretation of your results.
HTS[,.N,by=htype]
##                     htype     N
##                    <fctr> <int>
## 1: single family detached 10712
## 2:           multi-family  1410
## 3: single family attached   934
## 4:                   <NA>   963
## 5:                  other   193
HTS<-HTS[htype%in%c("single family attached","single family detached")]

# Step 1: Normality
library(ggplot2)

p1<-ggplot(data=HTS,aes(x=hhincome))+
  geom_histogram()+
  facet_grid(htype~.)

ggsave(filename = "gabrielle-rodriguezbailon_plot_p1.pdf",plot = p1) 
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 729 rows containing non-finite values (`stat_bin()`).
#Step 2: Variance 
ggplot(data=HTS,aes(x=hhincome, y=htype))+
  geom_boxplot()
## Warning: Removed 729 rows containing non-finite values (`stat_boxplot()`).
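# As a complement to the boxplot, a formal test of equal variances between the two
# housing types could be added here (a sketch; not part of the original lab steps):
HTS[,var.test(hhincome ~ htype)] # F test for equality of the two group variances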

# Step 3: dependent variable types
class(HTS$hhincome) ; class(HTS$htype)
## [1] "numeric"
## [1] "factor"
two_tailed_t_test<-HTS[,t.test(formula = hhincome ~ htype)] # two-tailed
two_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  hhincome by htype
## t = 21.426, df = 1106, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group single family detached and group single family attached is not equal to 0
## 95 percent confidence interval:
##  23.58036 28.33443
## sample estimates:
## mean in group single family detached mean in group single family attached 
##                             75.38215                             49.42475
one_tailed_t_test<-HTS[,t.test(formula = lnhhincome ~ htype,alternative = 'greater')] # one-tailed
one_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  lnhhincome by htype
## t = 17.105, df = 964.05, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group single family detached and group single family attached is not equal to 0
## 95 percent confidence interval:
##  0.4305294 0.5421212
## sample estimates:
## mean in group single family detached mean in group single family attached 
##                             4.142698                             3.656373
curve(dt(x, df = 964.05), from = -25, to = 25)

abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')

upper975 <- qt(p = .975, df = 964.05)
abline(v = upper975,col='red')
lower025 <- qt(p = .025, df = 964.05)
abline(v = lower025,col='red')
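# The prompt also asks for the hypothesis-testing plot as a PDF; one way to write the
# base-graphics plot to a file (a sketch, the file name is only illustrative):
pdf("t_test_plot_2_1.pdf", width = 7, height = 5)
curve(dt(x, df = 964.05), from = -25, to = 25)
abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')
abline(v = qt(p = .975, df = 964.05),col='red')
abline(v = qt(p = .025, df = 964.05),col='red')
dev.off()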

# Both T-tests (two-tailed and one-tailed) are statistically significant (p-value < 0.05), which implies that H0 can be rejected. Hence we can state that the difference in mean household income between household types exists and is not due to chance.
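# The same decision can also be checked programmatically from the stored test object (a sketch):
two_tailed_t_test$p.value < 0.05 # TRUE -> reject H0 at the 5% level
abs(two_tailed_t_test$statistic) > qt(.975, df = two_tailed_t_test$parameter) # TRUE -> reject H0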

# 2.2 Filter the sample to select only the region of San Antonio. Prepare a T-test to show if household vehicle miles traveled (in natural logs - lnvmt) is statistically different between households living in neighborhoods with a job-population index (variable `jobpop`) over and under the city median (of the `jobpop` variable, of course).
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
HTS[,.N,by=region]
##                region     N
##                <fctr> <int>
##  1:       Seattle, WA  3908
##  2:   Kansas City, MO  3022
##  3:        Eugene, OR  1674
##  4:   San Antonio, TX  1563
##  5:       Detroit, MI   939
##  6:      Richmond, VA   612
##  7:    Charleston, SC   243
##  8: Winston-Salem, NC  1459
##  9:      Syracuse, NY   654
## 10:       Madison, WI   138
median(HTS$jobpop, na.rm = TRUE) # jobpop has missing values, so na.rm = TRUE is needed
HTS[,over_under_jobpop:=as.numeric(jobpop>=0.6457728)] # split at the median jobpop value
HTS[,over_under_jobpop:=factor(over_under_jobpop,levels = c(0,1),labels = c("under median","over median"))]
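# A more general version (a sketch) computes the median once and reuses it,
# instead of hard-coding the threshold value:
jobpop_median <- median(HTS$jobpop, na.rm = TRUE)
HTS[,over_under_jobpop:=factor(jobpop>=jobpop_median,
                               levels = c(FALSE,TRUE),
                               labels = c("under median","over median"))]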

HTS<-HTS[over_under_jobpop%in%c("under median", "over median")]

HTS<-HTS[region%in%c("San Antonio, TX")]

# Step 1: Normality
library(ggplot2)

ggplot(data=HTS,aes(x=lnvmt))+
  geom_histogram()+
  facet_grid(over_under_jobpop~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 43 rows containing non-finite values (`stat_bin()`).

#Step 2: Variance 
ggplot(data=HTS,aes(x=lnvmt, y=over_under_jobpop))+
  geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).

# Step 3: dependent variable types
class(HTS$lnvmt) ; class(HTS$over_under_jobpop)
## [1] "numeric"
## [1] "factor"
two_tailed_t_test<-HTS[,t.test(formula = lnvmt ~ over_under_jobpop)] # two-tailed
two_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  lnvmt by over_under_jobpop
## t = 2.1612, df = 1512.7, p-value = 0.03084
## alternative hypothesis: true difference in means between group under median and group over median is not equal to 0
## 95 percent confidence interval:
##  0.01040175 0.21479104
## sample estimates:
## mean in group under median  mean in group over median 
##                   3.103079                   2.990483
one_tailed_t_test<-HTS[,t.test(formula = lnvmt ~ over_under_jobpop,alternative = 'greater')] # one-tailed
one_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  lnvmt by over_under_jobpop
## t = 2.1612, df = 1512.7, p-value = 0.03084
## alternative hypothesis: true difference in means between group under median and group over median is not equal to 0
## 95 percent confidence interval:
##  0.01040175 0.21479104
## sample estimates:
## mean in group under median  mean in group over median 
##                   3.103079                   2.990483
curve(dt(x, df = 1512.7), from = -25, to = 25)

abline(h=0,col='blue')
points(x=two_tailed_t_test$statistic,y=0,col='red')

upper975 <- qt(p = .975, df = 1512.7)
abline(v = upper975,col='red')
lower025 <- qt(p = .025, df = 1512.7)
abline(v = lower025,col='red')
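# Interpretation: p-value = 0.031 < 0.05, so H0 is rejected; mean lnvmt differs between
# households under vs. over the jobpop median (3.10 vs 2.99), with under-median
# neighborhoods showing higher vehicle miles traveled. Quick check (a sketch):
two_tailed_t_test$p.value < 0.05 # TRUE -> reject H0 at the 5% level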

# 2.3 Using the same data set (San Antonio sample), run an ANOVA test to see if there are significant differences in vehicle miles traveled by household across income categories. Follow the same steps used in the ANOVA exercise done in class. Produce three PDFs: one checking the normality assumption of the dependent variable, a second one checking the presence of outliers, and a third one showing the Tukey (post hoc) T-tests plot.
HTS <- data.table(read.spss("datasets/HTS.household.10regions.sav",to.data.frame = T))
## re-encoding from CP1252
HTS[,.N,by=region]
##                region     N
##                <fctr> <int>
##  1:       Seattle, WA  3908
##  2:   Kansas City, MO  3022
##  3:        Eugene, OR  1674
##  4:   San Antonio, TX  1563
##  5:       Detroit, MI   939
##  6:      Richmond, VA   612
##  7:    Charleston, SC   243
##  8: Winston-Salem, NC  1459
##  9:      Syracuse, NY   654
## 10:       Madison, WI   138
HTS<-HTS[region%in%c("San Antonio, TX")]

ggplot(data=HTS[,.(Mean_lnvmt=mean(lnvmt)),by=income_cat], aes(x=income_cat,y=Mean_lnvmt))+
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# shows the mean lnvmt for each income category (a first look at whether the group means differ)

hist(HTS$lnvmt) 

#shows normality of dependent variable

ggplot(data=HTS, aes(x=income_cat, y=lnvmt))+
  geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).

HTS[income_cat=="low (<35K)" & lnvmt<0, .(rhhnum, lnvmt)]
##        rhhnum       lnvmt
##         <num>       <num>
##  1: 150000094 -0.55056448
##  2: 150000242 -2.15561555
##  3: 150000400 -0.50098001
##  4: 150000529 -0.65287779
##  5: 150000552 -0.18906997
##  6: 150000667 -0.15533779
##  7: 150000759 -0.51702924
##  8: 150000766 -0.81596098
##  9: 150001173 -0.01811967
## 10: 150004217 -0.17584472
HTS[income_cat=="middle (35K-75K)" & lnvmt<1, .(rhhnum, lnvmt)]
##        rhhnum        lnvmt
##         <num>        <num>
##  1: 150000034 -1.617320225
##  2: 150000075  0.110661824
##  3: 150000123  0.547420044
##  4: 150000162  0.971593758
##  5: 150000201  0.962427632
##  6: 150000205 -0.001480761
##  7: 150000359 -0.415248432
##  8: 150000488 -0.057901070
##  9: 150000585  0.934789108
## 10: 150000656  0.630968751
## 11: 150000728  0.803340881
## 12: 150000740  0.786823621
## 13: 150000768  0.087722985
## 14: 150000828  0.820388367
## 15: 150000857  0.995648873
## 16: 150001060  0.673487713
## 17: 150001072 -0.475510803
## 18: 150001122  0.471652396
## 19: 150001374 -1.098307547
## 20: 155000211  0.816963848
## 21: 155000323 -1.613511253
## 22: 155000334  0.788930682
##        rhhnum        lnvmt
HTS[income_cat=="high (>75K)" & lnvmt<1.5, .(rhhnum, lnvmt)]
##       rhhnum     lnvmt
##        <num>     <num>
## 1: 150000113 1.3566209
## 2: 150000164 0.9730796
## 3: 150000315 1.4861585
## 4: 150001040 0.8109847
## 5: 150001169 0.7610517
## 6: 155000199 1.4268300
## 7: 155000223 1.0199762
HTS_bp<-boxplot(HTS$lnvmt~HTS$income_cat)

outliers <- HTS_bp$out

HTS2<-HTS[!lnvmt%in%outliers,]

boxplot(HTS2$lnvmt~HTS2$income_cat)

# checks the presence of outliers and re-plots the data after removing them

bartlett.test(lnvmt ~ income_cat, data=HTS2)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  lnvmt by income_cat
## Bartlett's K-squared = 62.599, df = 2, p-value = 2.552e-14
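# Bartlett's test rejects equal variances (p-value < 0.05); as a robustness check, a
# Welch one-way ANOVA, which does not assume equal variances, could also be run
# (a sketch, in addition to the class steps):
oneway.test(lnvmt ~ income_cat, data = HTS2)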
fit<-aov(HTS2$lnvmt~HTS2$income_cat)
summary(fit)
##                   Df Sum Sq Mean Sq F value Pr(>F)    
## HTS2$income_cat    2  139.1   69.54      93 <2e-16 ***
## Residuals       1482 1108.1    0.75                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 43 observations deleted due to missingness
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = HTS2$lnvmt ~ HTS2$income_cat)
## 
## $`HTS2$income_cat`
##                                   diff        lwr       upr     p adj
## middle (35K-75K)-low (<35K)  0.5946113 0.46854317 0.7206794 0.0000000
## high (>75K)-low (<35K)       0.7968258 0.65009509 0.9435565 0.0000000
## high (>75K)-middle (35K-75K) 0.2022145 0.07144679 0.3329822 0.0008622
plot(TukeyHSD(fit))

# shows the Tukey (post hoc) T-tests plot
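# The prompt asks for three PDFs (normality check, outliers, Tukey plot); the base-graphics
# plots can be written to files like this (a sketch, file names are only illustrative):
pdf("anova_normality_hist.pdf")
hist(HTS$lnvmt)
dev.off()
pdf("anova_outliers_boxplot.pdf")
boxplot(HTS$lnvmt~HTS$income_cat)
dev.off()
pdf("anova_tukey_plot.pdf")
plot(TukeyHSD(fit))
dev.off()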


# 2. [30 points] Run a T-test and an ANOVA test in your data.
#T-test
Crash_data <- read.csv("//Users/gabbyrodriguez/myrepo/07-lab-gabrielle-rodriguezbailon/my-datasets/my_list.csv")

setDT(Crash_data)

CF <- Crash_data[,.N,by=Contributing.Factors]

Crash_data<-Crash_data[Contributing.Factors%in%c("DRIVER INATTENTION", "FOLLOWED TOO CLOSELY")]

ggplot(data=Crash_data,aes(x=Crash.Not.Injured.Count))+
  geom_histogram()+
  facet_grid(Contributing.Factors~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=Crash_data,aes(x=Crash.Not.Injured.Count, y=Contributing.Factors))+
  geom_boxplot()

class(Crash_data$Crash.Not.Injured.Count) ; class(Crash_data$Contributing.Factors)
## [1] "integer"
## [1] "character"
two_tailed_t_test<-Crash_data[,t.test(formula = Crash.Not.Injured.Count ~ Contributing.Factors)] # two-tailed
two_tailed_t_test
## 
##  Welch Two Sample t-test
## 
## data:  Crash.Not.Injured.Count by Contributing.Factors
## t = -1.6984, df = 531.88, p-value = 0.09002
## alternative hypothesis: true difference in means between group DRIVER INATTENTION and group FOLLOWED TOO CLOSELY is not equal to 0
## 95 percent confidence interval:
##  -0.72954485  0.05298969
## sample estimates:
##   mean in group DRIVER INATTENTION mean in group FOLLOWED TOO CLOSELY 
##                           1.969014                           2.307292
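# Interpretation: p-value = 0.09 > 0.05, so we fail to reject H0; the mean number of
# not-injured persons per crash is not statistically different between the two
# contributing factors at the 5% level. Quick check (a sketch):
two_tailed_t_test$p.value < 0.05 # FALSE -> fail to reject H0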
# ANOVA
Crash_data <- read.csv("//Users/gabbyrodriguez/myrepo/07-lab-gabrielle-rodriguezbailon/my-datasets/my_list.csv")

setDT(Crash_data)

Crash_data<-Crash_data[Contributing.Factors%in%c("DRIVER INATTENTION", "FOLLOWED TOO CLOSELY","CHANGED LANE WHEN UNSAFE", "FAILED TO CONTROL SPEED")]


ggplot(data=Crash_data, aes(x=Contributing.Factors, y= Crash.Not.Injured.Count))+
  geom_boxplot()

# shows a boxplot for each contributing factor

Crash_data[Contributing.Factors=="DRIVER INATTENTION" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
##    Crash.ID Crash.Not.Injured.Count
##       <int>                   <int>
## 1: 18699659                       6
## 2: 18717692                       8
## 3: 18751687                      49
## 4: 18858872                       6
## 5: 18842955                       7
## 6: 18917611                       6
## 7: 19227395                       7
Crash_data[Contributing.Factors=="FOLLOWED TOO CLOSELY" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
##     Crash.ID Crash.Not.Injured.Count
##        <int>                   <int>
##  1: 18742118                       6
##  2: 18858449                       7
##  3: 18886432                       6
##  4: 18913881                       6
##  5: 18935375                      10
##  6: 18956156                       7
##  7: 18961695                      10
##  8: 18978575                       7
##  9: 19001674                       6
## 10: 19058077                       8
## 11: 19168862                       7
## 12: 19244311                       8
Crash_data[Contributing.Factors=="CHANGED LANE WHEN UNSAFE" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
##    Crash.ID Crash.Not.Injured.Count
##       <int>                   <int>
## 1: 18772944                       6
## 2: 18957010                       6
## 3: 19041646                       7
Crash_data[Contributing.Factors=="FAILED TO CONTROL SPEED" & Crash.Not.Injured.Count>5, .(Crash.ID, Crash.Not.Injured.Count)]
##    Crash.ID Crash.Not.Injured.Count
##       <int>                   <int>
## 1: 19022941                       7
## 2: 19046364                       6
## 3: 19127655                       7
# lists the crash IDs of the potential outliers in each contributing-factor group

# deleting outliers 
Crash_data_bp<-boxplot(Crash_data$Crash.Not.Injured.Count~Crash_data$Contributing.Factors)

#creates a boxplot from the graphics library and shows the outliers

outliers <- Crash_data_bp$out
#creates an object that stores the outliers

Crash_data[Crash.Not.Injured.Count%in%outliers,]
##      Crash.ID Active.School.Zone.Flag At.Intersection.Flag  Bridge.Detail
##         <int>                  <char>               <lgcl>         <char>
##   1: 18695208                      NO                FALSE NOT APPLICABLE
##   2: 18699659                      NO                FALSE NOT APPLICABLE
##   3: 18719993                      NO                FALSE NOT APPLICABLE
##   4: 18708655                      NO                FALSE NOT APPLICABLE
##   5: 18711692                      NO                FALSE NOT APPLICABLE
##  ---                                                                     
## 103: 19259401                      NO                FALSE NOT APPLICABLE
## 104: 19267704                      NO                FALSE NOT APPLICABLE
## 105: 19276859                      NO                FALSE NOT APPLICABLE
## 106: 19282593                      NO                FALSE NOT APPLICABLE
## 107: 19288814                      NO                FALSE NOT APPLICABLE
##          Contributing.Factors Crash.Date
##                        <char>     <char>
##   1:       DRIVER INATTENTION    1/11/22
##   2:       DRIVER INATTENTION    1/16/22
##   3:       DRIVER INATTENTION    1/18/22
##   4:     FOLLOWED TOO CLOSELY    1/21/22
##   5:  FAILED TO CONTROL SPEED    1/24/22
##  ---                                    
## 103: CHANGED LANE WHEN UNSAFE   11/30/22
## 104: CHANGED LANE WHEN UNSAFE    12/3/22
## 105:       DRIVER INATTENTION   12/10/22
## 106:     FOLLOWED TOO CLOSELY   12/12/22
## 107:  FAILED TO CONTROL SPEED   12/12/22
##      Crash.Non.Suspected.Serious.Injury.Count Crash.Not.Injured.Count
##                                         <int>                   <int>
##   1:                                        0                       4
##   2:                                        0                       6
##   3:                                        0                       5
##   4:                                        0                       4
##   5:                                        0                       5
##  ---                                                                 
## 103:                                        0                       4
## 104:                                        0                       4
## 105:                                        0                       4
## 106:                                        0                       4
## 107:                                        0                       4
##      Crash.Number      Crash.Severity Road.Base.Type          Road.Class
##             <int>              <char>         <char>              <char>
##   1:   2022012643 C - POSSIBLE INJURY        No Data         CITY STREET
##   2:   2022016613     N - NOT INJURED        No Data US & STATE HIGHWAYS
##   3:   2022035913     N - NOT INJURED        No Data US & STATE HIGHWAYS
##   4:   2022024969     N - NOT INJURED        No Data          INTERSTATE
##   5:   2022027883     N - NOT INJURED        No Data          INTERSTATE
##  ---                                                                    
## 103:   2022581278     N - NOT INJURED        No Data          INTERSTATE
## 104:   2022589574     N - NOT INJURED        No Data          INTERSTATE
## 105:   2022598726     N - NOT INJURED        No Data         COUNTY ROAD
## 106:   2022604452     N - NOT INJURED        No Data US & STATE HIGHWAYS
## 107:   2022610671     N - NOT INJURED        No Data US & STATE HIGHWAYS
##      Speed.Limit Street.Name
##            <int>      <char>
##   1:          45      FM2696
##   2:          65      SL1604
##   3:          45      SH0016
##   4:          45 NW LOOP 410
##   5:          45      IH0410
##  ---                        
## 103:          65      IH0035
## 104:          65      IH0410
## 105:          40   TALLEY RD
## 106:          60      SL1604
## 107:          60      SL1604
# shows the full records for the outlier values of Crash.Not.Injured.Count
Crash_data_2<-Crash_data[!Crash.Not.Injured.Count%in%outliers,]
#deletes those outliers 

boxplot(Crash_data_2$Crash.Not.Injured.Count~Crash_data_2$Contributing.Factors)

# Step 2: is the dependent variable normal?

hist(Crash_data_2$Crash.Not.Injured.Count) # looks pretty normal to me!

# Step 3: variance homogeneity?
bartlett.test(Crash.Not.Injured.Count ~ Contributing.Factors, data=Crash_data_2) # H0: variances are equal; p-value = 0.097 > 0.05, so we fail to reject H0
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Crash.Not.Injured.Count by Contributing.Factors
## Bartlett's K-squared = 6.3138, df = 3, p-value = 0.0973
# Step 4: one-way anova

fit<-aov(Crash_data_2$Crash.Not.Injured.Count~Crash_data_2$Contributing.Factors)
summary(fit)
##                                    Df Sum Sq Mean Sq F value Pr(>F)  
## Crash_data_2$Contributing.Factors   3    8.8  2.9213    3.48 0.0157 *
## Residuals                         721  605.2  0.8394                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the mean not-injured counts differ across contributing factors: the p-value (0.0157) is below 0.05, so we reject H0 of equal means

#post-hoc test
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Crash_data_2$Crash.Not.Injured.Count ~ Crash_data_2$Contributing.Factors)
## 
## $`Crash_data_2$Contributing.Factors`
##                                                          diff         lwr
## DRIVER INATTENTION-CHANGED LANE WHEN UNSAFE      -0.210837438 -0.44760333
## FAILED TO CONTROL SPEED-CHANGED LANE WHEN UNSAFE -0.259658520 -0.56367891
## FOLLOWED TOO CLOSELY-CHANGED LANE WHEN UNSAFE    -0.005193699 -0.27490999
## FAILED TO CONTROL SPEED-DRIVER INATTENTION       -0.048821082 -0.31661070
## FOLLOWED TOO CLOSELY-DRIVER INATTENTION           0.205643739 -0.02245658
## FOLLOWED TOO CLOSELY-FAILED TO CONTROL SPEED      0.254464821 -0.04285667
##                                                         upr     p adj
## DRIVER INATTENTION-CHANGED LANE WHEN UNSAFE      0.02592845 0.1005702
## FAILED TO CONTROL SPEED-CHANGED LANE WHEN UNSAFE 0.04436187 0.1243740
## FOLLOWED TOO CLOSELY-CHANGED LANE WHEN UNSAFE    0.26452259 0.9999562
## FAILED TO CONTROL SPEED-DRIVER INATTENTION       0.21896854 0.9657367
## FOLLOWED TOO CLOSELY-DRIVER INATTENTION          0.43374406 0.0940975
## FOLLOWED TOO CLOSELY-FAILED TO CONTROL SPEED     0.55178632 0.1231212
plot(TukeyHSD(fit))
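# Note: although the overall ANOVA is significant (p = 0.0157), none of the pairwise Tukey
# comparisons is significant at the 5% level (all adjusted p-values > 0.05), so we cannot
# point to a specific pair of contributing factors driving the difference.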