Lab_5_Assignment

Author

Chii Kojima

Essentials

#insert packages here :)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(lubridate)

Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(patchwork)
library(ggplot2)

library(ggsci)
library(ggridges)
library(performance)
library(Hmisc)
Loading required package: lattice
Loading required package: survival
Loading required package: Formula

Attaching package: 'Hmisc'

The following objects are masked from 'package:dplyr':

    src, summarize

The following objects are masked from 'package:base':

    format.pval, units
library(corrplot)#to visualize correlation matrices
corrplot 0.92 loaded
library(car) #contains some statistical tests we need to assess assumptions
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
library(shiny)
# Work on a copy of the built-in Old Faithful data set and display it.
faithful1 <- faithful
print(faithful1)
    eruptions waiting
1       3.600      79
2       1.800      54
3       3.333      74
4       2.283      62
5       4.533      85
6       2.883      55
7       4.700      88
8       3.600      85
9       1.950      51
10      4.350      85
11      1.833      54
12      3.917      84
13      4.200      78
14      1.750      47
15      4.700      83
16      2.167      52
17      1.750      62
18      4.800      84
19      1.600      52
20      4.250      79
21      1.800      51
22      1.750      47
23      3.450      78
24      3.067      69
25      4.533      74
26      3.600      83
27      1.967      55
28      4.083      76
29      3.850      78
30      4.433      79
31      4.300      73
32      4.467      77
33      3.367      66
34      4.033      80
35      3.833      74
36      2.017      52
37      1.867      48
38      4.833      80
39      1.833      59
40      4.783      90
41      4.350      80
42      1.883      58
43      4.567      84
44      1.750      58
45      4.533      73
46      3.317      83
47      3.833      64
48      2.100      53
49      4.633      82
50      2.000      59
51      4.800      75
52      4.716      90
53      1.833      54
54      4.833      80
55      1.733      54
56      4.883      83
57      3.717      71
58      1.667      64
59      4.567      77
60      4.317      81
61      2.233      59
62      4.500      84
63      1.750      48
64      4.800      82
65      1.817      60
66      4.400      92
67      4.167      78
68      4.700      78
69      2.067      65
70      4.700      73
71      4.033      82
72      1.967      56
73      4.500      79
74      4.000      71
75      1.983      62
76      5.067      76
77      2.017      60
78      4.567      78
79      3.883      76
80      3.600      83
81      4.133      75
82      4.333      82
83      4.100      70
84      2.633      65
85      4.067      73
86      4.933      88
87      3.950      76
88      4.517      80
89      2.167      48
90      4.000      86
91      2.200      60
92      4.333      90
93      1.867      50
94      4.817      78
95      1.833      63
96      4.300      72
97      4.667      84
98      3.750      75
99      1.867      51
100     4.900      82
101     2.483      62
102     4.367      88
103     2.100      49
104     4.500      83
105     4.050      81
106     1.867      47
107     4.700      84
108     1.783      52
109     4.850      86
110     3.683      81
111     4.733      75
112     2.300      59
113     4.900      89
114     4.417      79
115     1.700      59
116     4.633      81
117     2.317      50
118     4.600      85
119     1.817      59
120     4.417      87
121     2.617      53
122     4.067      69
123     4.250      77
124     1.967      56
125     4.600      88
126     3.767      81
127     1.917      45
128     4.500      82
129     2.267      55
130     4.650      90
131     1.867      45
132     4.167      83
133     2.800      56
134     4.333      89
135     1.833      46
136     4.383      82
137     1.883      51
138     4.933      86
139     2.033      53
140     3.733      79
141     4.233      81
142     2.233      60
143     4.533      82
144     4.817      77
145     4.333      76
146     1.983      59
147     4.633      80
148     2.017      49
149     5.100      96
150     1.800      53
151     5.033      77
152     4.000      77
153     2.400      65
154     4.600      81
155     3.567      71
156     4.000      70
157     4.500      81
158     4.083      93
159     1.800      53
160     3.967      89
161     2.200      45
162     4.150      86
163     2.000      58
164     3.833      78
165     3.500      66
166     4.583      76
167     2.367      63
168     5.000      88
169     1.933      52
170     4.617      93
171     1.917      49
172     2.083      57
173     4.583      77
174     3.333      68
175     4.167      81
176     4.333      81
177     4.500      73
178     2.417      50
179     4.000      85
180     4.167      74
181     1.883      55
182     4.583      77
183     4.250      83
184     3.767      83
185     2.033      51
186     4.433      78
187     4.083      84
188     1.833      46
189     4.417      83
190     2.183      55
191     4.800      81
192     1.833      57
193     4.800      76
194     4.100      84
195     3.966      77
196     4.233      81
197     3.500      87
198     4.366      77
199     2.250      51
200     4.667      78
201     2.100      60
202     4.350      82
203     4.133      91
204     1.867      53
205     4.600      78
206     1.783      46
207     4.367      77
208     3.850      84
209     1.933      49
210     4.500      83
211     2.383      71
212     4.700      80
213     1.867      49
214     3.833      75
215     3.417      64
216     4.233      76
217     2.400      53
218     4.800      94
219     2.000      55
220     4.150      76
221     1.867      50
222     4.267      82
223     1.750      54
224     4.483      75
225     4.000      78
226     4.117      79
227     4.083      78
228     4.267      78
229     3.917      70
230     4.550      79
231     4.083      70
232     2.417      54
233     4.183      86
234     2.217      50
235     4.450      90
236     1.883      54
237     1.850      54
238     4.283      77
239     3.950      79
240     2.333      64
241     4.150      75
242     2.350      47
243     4.933      86
244     2.900      63
245     4.583      85
246     3.833      82
247     2.083      57
248     4.367      82
249     2.133      67
250     4.350      74
251     2.200      54
252     4.450      83
253     3.567      73
254     4.500      73
255     4.150      88
256     3.817      80
257     3.917      71
258     4.450      83
259     2.000      56
260     4.283      79
261     4.767      78
262     4.533      84
263     1.850      58
264     4.250      83
265     1.983      43
266     2.250      60
267     4.750      75
268     4.117      81
269     2.150      46
270     4.417      90
271     1.817      46
272     4.467      74
# Spearman rank correlation between eruption duration and waiting time.
# The data contain tied values, so an exact p-value cannot be computed.
# exact = FALSE requests the asymptotic p-value directly: the estimate and
# p-value are unchanged, but the "Cannot compute exact p-value with ties"
# warning is no longer raised.
cor.test(
  faithful1$eruptions,
  faithful1$waiting,
  method = "spearman",
  exact = FALSE
)
Warning in cor.test.default(faithful1$eruptions, faithful1$waiting, method =
"spearman"): Cannot compute exact p-value with ties

    Spearman's rank correlation rho

data:  faithful1$eruptions and faithful1$waiting
S = 744659, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.7779721 
#eruptions = length of eruption time in minutes
#waiting = time between eruptions in minutes

H0: The correlation between these two variables is 0.

Ha: The correlation between these two variables is not 0.

Since the correlation coefficient (rho) is 0.778, there is a strong positive relationship between the two variables. Additionally, the small p-value (p < 0.001) indicates that the correlation is statistically significant, so we reject the null hypothesis.

# Distribution of social class among patients admitted to the
# self-poisoning unit (A) and the gastro unit (B).
socioclass <- c("I", "II", "III", "IV", "V")
selfpoison <- c(17, 25, 39, 42, 32)
gastro <- c(22, 46, 73, 91, 57)

# One row per social class, one count column per unit.
patients <- data.frame(
  socioclass = socioclass,
  selfpoison = selfpoison,
  gastro = gastro
)
head(patients)
  socioclass selfpoison gastro
1          I         17     22
2         II         25     46
3        III         39     73
4         IV         42     91
5          V         32     57
# Inspect the structure (column names and types) of the patients data frame.
str(patients)
'data.frame':   5 obs. of  3 variables:
 $ socioclass: chr  "I" "II" "III" "IV" ...
 $ selfpoison: num  17 25 39 42 32
 $ gastro    : num  22 46 73 91 57
# Keep only the two count columns; the social class labels are left out so
# the table passed to the chi-squared test contains counts only.
pat2 <- patients[, c("selfpoison", "gastro")]
print(pat2)
  selfpoison gastro
1         17     22
2         25     46
3         39     73
4         42     91
5         32     57
# Chi-squared test on the table of counts (social class x unit).
# H0: There is no association between social class and unit
#     (self-poisoning vs. gastro).
# Ha: There is an association between social class and unit.
patientschi <- chisq.test(pat2)
print(patientschi)

    Pearson's Chi-squared test

data:  pat2
X-squared = 1.9885, df = 4, p-value = 0.7379
#Since the p-value (0.74) is greater than 0.05, we fail to reject the null hypothesis: there is not enough evidence of an association between social class and the unit of admission (self-poisoning vs. gastro).
# Preview the first rows of the built-in faithful data set.
head(faithful)
  eruptions waiting
1     3.600      79
2     1.800      54
3     3.333      74
4     2.283      62
5     4.533      85
6     2.883      55
# Preview the first rows of the working copy.
head(faithful1)
  eruptions waiting
1     3.600      79
2     1.800      54
3     3.333      74
4     2.283      62
5     4.533      85
6     2.883      55
# Simple linear regression: waiting time plotted against eruption duration,
# with a linear-model smooth (method = "lm") overlaid on the points.
ggplot(data = faithful1, mapping = aes(x = eruptions, y = waiting)) +
  theme_classic() +
  geom_point() +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

# Fit waiting time as a linear function of eruption duration, then report
# the coefficient table and fit statistics.
lm_faith <- lm(formula = waiting ~ eruptions, data = faithful1)
summary(lm_faith)

Call:
lm(formula = waiting ~ eruptions, data = faithful1)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.0796  -4.4831   0.2122   3.9246  15.9719 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  33.4744     1.1549   28.98   <2e-16 ***
eruptions    10.7296     0.3148   34.09   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.914 on 270 degrees of freedom
Multiple R-squared:  0.8115,    Adjusted R-squared:  0.8108 
F-statistic:  1162 on 1 and 270 DF,  p-value: < 2.2e-16
# Check the regression assumptions for the fitted model (discussed below).
check_model(lm_faith)

eruptions = length of eruption time in minutes; waiting = time between eruptions in minutes.

Hypothesis

H0: There is no correlation between eruptions and waiting. Ha: There is a correlation between eruptions and waiting.

From the summary, the intercept is 33.4744 and the slope is 10.7296. This means that, on average, for each additional minute of eruption duration, the waiting time between eruptions increases by 10.7296 minutes. Both coefficients are statistically significant (p < 0.001), indicating a strong linear relationship between the waiting time and the eruption duration. The adjusted R-squared value of 0.8108 suggests that approximately 81% of the variance in waiting time can be explained by this model. Therefore, we have enough evidence to reject our null hypothesis that there is no correlation between eruptions and waiting.

Check Model

  1. Linearity of the data: Although the reference line in the linearity plot is not completely flat and horizontal, the linearity is good enough.

  2. Normality of residuals: From the graph we can see that the dots follow the line, so this assumption is met.

  3. Homogeneity of residual variance: Although the line is not completely flat and horizontal, the homogeneity of residual variance is good enough for linear regression.

  4. Independence of residual error terms: There is no way to check this without understanding how the data were collected.

library(palmerpenguins)
# Preview the first rows of the penguins data set.
head(penguins)
# A tibble: 6 × 8
  species island    bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex    year
  <fct>   <fct>              <dbl>         <dbl>       <int>   <int> <fct> <int>
1 Adelie  Torgersen           39.1          18.7         181    3750 male   2007
2 Adelie  Torgersen           39.5          17.4         186    3800 fema…  2007
3 Adelie  Torgersen           40.3          18           195    3250 fema…  2007
4 Adelie  Torgersen           NA            NA            NA      NA <NA>   2007
5 Adelie  Torgersen           36.7          19.3         193    3450 fema…  2007
6 Adelie  Torgersen           39.3          20.6         190    3650 male   2007
# … with abbreviated variable names ¹​flipper_length_mm, ²​body_mass_g
penguins1 <- penguins

# a) Statistical test used: t-test

# b) Keep only the two species being compared, drop rows with a missing
#    body mass, and retain just the columns needed for the test.
pen_body <- penguins1 %>%
  filter(species %in% c("Adelie", "Gentoo"), !is.na(body_mass_g)) %>%
  select(species, body_mass_g)
print(pen_body)
# A tibble: 274 × 2
   species body_mass_g
   <fct>         <int>
 1 Adelie         3750
 2 Adelie         3800
 3 Adelie         3250
 4 Adelie         3450
 5 Adelie         3650
 6 Adelie         3625
 7 Adelie         4675
 8 Adelie         3475
 9 Adelie         4250
10 Adelie         3300
# … with 264 more rows
# Adelie-only subset: species and body mass, rows with missing mass removed.
pen_body_adelie <- penguins1 %>%
  filter(species == "Adelie", !is.na(body_mass_g)) %>%
  select(species, body_mass_g)
print(pen_body_adelie)
# A tibble: 151 × 2
   species body_mass_g
   <fct>         <int>
 1 Adelie         3750
 2 Adelie         3800
 3 Adelie         3250
 4 Adelie         3450
 5 Adelie         3650
 6 Adelie         3625
 7 Adelie         4675
 8 Adelie         3475
 9 Adelie         4250
10 Adelie         3300
# … with 141 more rows
# Gentoo-only subset: species and body mass, rows with missing mass removed.
pen_body_gentoo <- penguins1 %>%
  filter(species == "Gentoo", !is.na(body_mass_g)) %>%
  select(species, body_mass_g)
print(pen_body_gentoo)
# A tibble: 123 × 2
   species body_mass_g
   <fct>         <int>
 1 Gentoo         4500
 2 Gentoo         5700
 3 Gentoo         4450
 4 Gentoo         5700
 5 Gentoo         5400
 6 Gentoo         4550
 7 Gentoo         4800
 8 Gentoo         5200
 9 Gentoo         4400
10 Gentoo         5150
# … with 113 more rows
# c) Run the test
# First compute, per species, the mean, standard deviation, sample size and
# standard error of body mass for plotting. The dplyr:: prefix is needed
# because Hmisc (attached after the tidyverse) masks summarize().
pen_body1 <- pen_body %>%
  group_by(species) %>%
  dplyr::summarise(
    mean = mean(body_mass_g),
    sd = sd(body_mass_g),
    n = n(),
    se = sd / sqrt(n)
  )
print(pen_body1)
# A tibble: 2 × 5
  species  mean    sd     n    se
  <fct>   <dbl> <dbl> <int> <dbl>
1 Adelie  3701.  459.   151  37.3
2 Gentoo  5076.  504.   123  45.5
#dplyr:: makes R use the tidyverse (dplyr) version of summarize(); use it when you get the error "Error in summarize(., mean = mean(body_mass_g), sd = sd(body_mass_g),  : argument "by" is missing, with no default"

# Graph: mean body mass per species with +/- 1 standard error bars.
# The error bars inherit x and colour from the plot-level mapping.
ggplot(pen_body1, aes(x = species, y = mean, color = species)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  labs(title = "Body Mass") +
  scale_color_aaas() +
  theme_classic()

# Test: does body mass differ between the two species?
# H0: The mean body mass is the same in both species.
# Ha: The mean body mass differs between the species.
# var.equal = FALSE requests the Welch version of the two-sample t-test.
t.test(
  body_mass_g ~ species,
  data = pen_body,
  alternative = "two.sided",
  var.equal = FALSE
)

    Welch Two Sample t-test

data:  body_mass_g by species
t = -23.386, df = 249.64, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Adelie and group Gentoo is not equal to 0
95 percent confidence interval:
 -1491.183 -1259.525
sample estimates:
mean in group Adelie mean in group Gentoo 
            3700.662             5076.016 
#The p-value is < 2.2e-16, which is very small and indicates strong evidence against the null hypothesis that the mean body mass is the same between the two groups, so we reject the null hypothesis.

Depth

# Species means +/- 1 SE shown together with the raw observations.
# - The raw points are drawn first so they sit underneath the means and
#   error bars instead of covering them.
# - The jitter is seeded so the figure is identical on every render.
# - The y axis carries both raw values and means, so it is labelled
#   "Body Mass (g)" rather than "Mean Body Mass (g)".
ggplot(pen_body1, aes(x = species, y = mean, color = species)) +
  geom_point(
    data = pen_body,
    aes(x = species, y = body_mass_g),
    position = position_jitterdodge(dodge.width = 0.75, seed = 1),
    alpha = 0.3
  ) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  geom_point() +
  scale_color_aaas() +
  theme_classic() +
  labs(title = "Body Mass", y = "Body Mass (g)", x = NULL)

# Display the full penguins data set before subsetting.
penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
   <fct>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
 1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
 2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
 3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
 4 Adelie  Torgersen           NA            NA           NA      NA <NA>   2007
 5 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
 6 Adelie  Torgersen           39.3          20.6        190    3650 male   2007
 7 Adelie  Torgersen           38.9          17.8        181    3625 fema…  2007
 8 Adelie  Torgersen           39.2          19.6        195    4675 male   2007
 9 Adelie  Torgersen           34.1          18.1        193    3475 <NA>   2007
10 Adelie  Torgersen           42            20.2        190    4250 <NA>   2007
# … with 334 more rows, and abbreviated variable names ¹​flipper_length_mm,
#   ²​body_mass_g
# Chinstrap penguins only, restricted to the four columns used in the
# plot and model that follow.
pen_chin <- penguins %>%
  select(species, body_mass_g, sex, bill_length_mm) %>%
  filter(species == "Chinstrap")

print(pen_chin)
# A tibble: 68 × 4
   species   body_mass_g sex    bill_length_mm
   <fct>           <int> <fct>           <dbl>
 1 Chinstrap        3500 female           46.5
 2 Chinstrap        3900 male             50  
 3 Chinstrap        3650 male             51.3
 4 Chinstrap        3525 female           45.4
 5 Chinstrap        3725 male             52.7
 6 Chinstrap        3950 female           45.2
 7 Chinstrap        3250 female           46.1
 8 Chinstrap        3750 male             51.3
 9 Chinstrap        4150 female           46  
10 Chinstrap        3700 male             51.3
# … with 58 more rows
# Bill length against body mass for Chinstrap penguins, coloured by sex,
# with a separate linear-model smooth for each sex.
ggplot(pen_chin, aes(x = body_mass_g, y = bill_length_mm, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Body Mass vs Bill Length") +
  scale_color_aaas() +
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'

# Model bill length with body mass, sex and their interaction
# (body_mass_g * sex expands to both main effects plus body_mass_g:sex).
lm1 <- lm(formula = bill_length_mm ~ body_mass_g * sex, data = pen_chin)
print(lm1)

Call:
lm(formula = bill_length_mm ~ body_mass_g * sex, data = pen_chin)

Coefficients:
        (Intercept)          body_mass_g              sexmale  
          35.982910             0.003003            11.056140  
body_mass_g:sexmale  
          -0.001973  
# Coefficient table and overall fit statistics for the interaction model.
summary(lm1)

Call:
lm(formula = bill_length_mm ~ body_mass_g * sex, data = pen_chin)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6911 -1.1108 -0.2264  0.7923 10.9076 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         35.982910   5.196611   6.924 2.53e-09 ***
body_mass_g          0.003003   0.001469   2.044    0.045 *  
sexmale             11.056140   6.924654   1.597    0.115    
body_mass_g:sexmale -0.001973   0.001870  -1.055    0.295    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.407 on 64 degrees of freedom
Multiple R-squared:  0.5036,    Adjusted R-squared:  0.4803 
F-statistic: 21.64 on 3 and 64 DF,  p-value: 8.606e-10
# Analysis of variance table for the model terms (body mass, sex, interaction).
anova(lm1)
Analysis of Variance Table

Response: bill_length_mm
                Df Sum Sq Mean Sq F value    Pr(>F)    
body_mass_g      1 197.10 197.101 34.0126 1.959e-07 ***
sex              1 172.66 172.661 29.7951 8.337e-07 ***
body_mass_g:sex  1   6.45   6.453  1.1136    0.2953    
Residuals       64 370.88   5.795                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#There is a significant effect of body mass on bill length (p<0.001) and of sex on bill length (p<0.001). From the graph we can see that as body mass increases, the bill length also increases. For sex and bill length, males have longer bills than females. However, the p-value for body_mass_g:sex is 0.2953 (>0.05), which suggests that sex doesn't change the relationship between body mass and bill length.
# Read the bigfoot sightings CSV from the TidyTuesday repository (2022-09-13).
bigfoot <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
Rows: 5021 Columns: 28
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): observed, location_details, county, state, season, title, classif...
dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
date  (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the imported data to check the columns and types.
bigfoot
# A tibble: 5,021 × 28
   observed  locat…¹ county state season title latit…² longi…³ date       number
   <chr>     <chr>   <chr>  <chr> <chr>  <chr>   <dbl>   <dbl> <date>      <dbl>
 1 "I was c…  <NA>   Winst… Alab… Summer <NA>     NA      NA   NA          30680
 2 "Ed L. w… "East … Valde… Alas… Fall   <NA>     NA      NA   NA           1261
 3 "While a… "Great… Washi… Rhod… Fall   Repo…    41.4   -71.5 1974-09-20   6496
 4 "Hello, … "I wou… York … Penn… Summer <NA>     NA      NA   NA           8000
 5 "It was … "Loggi… Yamhi… Oreg… Spring <NA>     NA      NA   NA            703
 6 "My two … "The c… Washi… Okla… Fall   Repo…    35.3   -99.2 1973-09-28   9765
 7 "I was s… "Vince… Washi… Ohio  Summer Repo…    39.4   -81.7 1971-08-01   4983
 8 "Well la… "Both … Westc… New … Fall   Repo…    41.3   -73.7 2010-09-01  31940
 9 "I grew … "The W… Washo… Neva… Fall   Repo…    39.6  -120.  1970-09-01   5692
10 "heh i k… "the r… Warre… New … Fall   <NA>     NA      NA   NA            438
# … with 5,011 more rows, 18 more variables: classification <chr>,
#   geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
#   temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
#   moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
#   precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
#   visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>, and abbreviated
#   variable names ¹​location_details, ²​latitude, ³​longitude
# Number of sightings per state and year for Ohio and Washington.
# Reports without a date have no year, so they are removed before counting
# rather than counted under an NA year and dropped afterwards.
# dplyr::count(state, year) returns an ungrouped tibble, so no leftover
# state/year grouping is carried into later steps.
# NOTE: state-year combinations with no sightings do not appear as zero rows.
bigfoot_ow <- bigfoot %>%
  filter(state %in% c("Ohio", "Washington"), !is.na(date)) %>%
  mutate(year = year(date)) %>%
  dplyr::count(state, year)

bigfoot_ow
# A tibble: 121 × 3
# Groups:   state, year [121]
   state  year     n
   <chr> <dbl> <int>
 1 Ohio   1956     1
 2 Ohio   1958     2
 3 Ohio   1962     1
 4 Ohio   1963     1
 5 Ohio   1967     1
 6 Ohio   1968     1
 7 Ohio   1970     1
 8 Ohio   1971     2
 9 Ohio   1972     2
10 Ohio   1973     1
# … with 111 more rows
# Null hypothesis: there is no difference in the number of annual bigfoot
# sightings between Ohio and Washington.
# Alternative hypothesis: there is a difference in the number of annual
# bigfoot sightings between Ohio and Washington.

# Per-state summary of the yearly counts. mean and sd are computed from the
# sightings column `n` before `n` is redefined as the number of years, which
# is the value the standard error then uses.
bigfoot_ow_test <- bigfoot_ow %>%
  group_by(state) %>%
  dplyr::summarise(
    mean = mean(n),
    sd = sd(n),
    n = n(),
    se = sd / sqrt(n)
  )

print(bigfoot_ow_test)
# A tibble: 2 × 5
  state       mean    sd     n    se
  <chr>      <dbl> <dbl> <int> <dbl>
1 Ohio        5.05  3.76    56 0.502
2 Washington  8.29  7.29    65 0.905
# Graph: mean number of sightings per year for each state, +/- 1 SE.
# The graph suggests a difference in the number of bigfoot sightings between
# Ohio and Washington: Washington has a higher number of bigfoot sightings
# than Ohio.
ggplot(bigfoot_ow_test, aes(x = state, y = mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se))

# T-test (Welch, two-sided) comparing yearly sighting counts between states.
# A p-value < 0.01 shows that there is a significant difference in the number
# of bigfoot sightings between Ohio and Washington. The graph also supports
# this.
t.test(
  n ~ state,
  data = bigfoot_ow,
  alternative = "two.sided",
  var.equal = FALSE
)

    Welch Two Sample t-test

data:  n by state
t = -3.1298, df = 98.619, p-value = 0.002301
alternative hypothesis: true difference in means between group Ohio and group Washington is not equal to 0
95 percent confidence interval:
 -5.292127 -1.185345
sample estimates:
      mean in group Ohio mean in group Washington 
                5.053571                 8.292308