# Point the CRAN entry of the "repos" option at the US mirror so that
# install.packages() can run without prompting for a mirror.
r <- getOption("repos")
r["CRAN"] <- "http://cran.us.r-project.org"
options(repos = r)
Assignment 2
# One-time setup: install every package this document loads, then let tinytex
# install its TeX distribution. Package names must be plain ASCII-quoted
# strings; typographic quotes are not valid R syntax.
install.packages(c(
  "ggplot2", "ggpubr", "tidyverse", "broom", "tinytex",
  "xlsx", "readxl", "quarto", "rmarkdown"
))
tinytex::install_tinytex()
library(ggplot2)
library(ggpubr)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(xlsx)
library(readxl)
library(tinytex)
library(quarto)
# Import the first study's spreadsheet and report the class of each column.
dep1 <- read_excel("medical1-2.xlsx")
# vapply() always returns a character vector, whatever the input looks like.
vapply(dep1, function(col) class(col)[1], character(1))
Florida New York North Carolina
"numeric" "numeric" "numeric"
# Replace the full state names with their two-letter abbreviations, then show
# the renamed table. A name that is not present is simply left alone.
names(dep1)[names(dep1) == "Florida"] <- "FL"
names(dep1)[names(dep1) == "New York"] <- "NY"
names(dep1)[names(dep1) == "North Carolina"] <- "NC"
print(dep1)
# A tibble: 20 × 3
FL NY NC
<dbl> <dbl> <dbl>
1 3 8 10
2 7 11 7
3 7 9 3
4 3 7 5
5 8 8 11
6 8 7 8
7 8 8 4
8 5 4 3
9 5 13 7
10 2 10 8
11 6 6 8
12 2 8 7
13 6 12 3
14 6 8 9
15 9 6 8
16 7 8 12
17 5 5 6
18 4 7 3
19 7 7 8
20 3 8 11
# Descriptive statistics (min, quartiles, median, mean, max) per state column.
summary(dep1)
FL NY NC
Min. :2.00 Min. : 4.00 Min. : 3.00
1st Qu.:3.75 1st Qu.: 7.00 1st Qu.: 4.75
Median :6.00 Median : 8.00 Median : 7.50
Mean :5.55 Mean : 8.00 Mean : 7.05
3rd Qu.:7.00 3rd Qu.: 8.25 3rd Qu.: 8.25
Max. :9.00 Max. :13.00 Max. :12.00
# One-Way ANOVA ## Pivot the table so that the depression score is the value and location is the factor.
# Rebuild the first study's scores as a plain data frame: one column per
# state, one row per subject (20 subjects per state).
dep1 <- data.frame(
  FL = c(3, 7, 7, 3, 8, 8, 8, 5, 5, 2, 6, 2, 6, 6, 9, 7, 5, 4, 7, 3),
  NY = c(8, 11, 9, 7, 8, 7, 8, 4, 13, 10, 6, 8, 12, 8, 6, 8, 5, 7, 7, 8),
  NC = c(10, 7, 3, 5, 11, 8, 4, 3, 7, 8, 8, 7, 3, 9, 8, 12, 6, 3, 8, 11)
)
# Reshape to long form: one row per (subject, state) pair, with the state in
# `Location` and the score in `Depression_score`.
dep_long <- pivot_longer(
  dep1,
  cols = everything(),
  names_to = "Location",
  values_to = "Depression_score"
)
print(dep_long)
# A tibble: 60 × 2
Location Depression_score
<chr> <dbl>
1 FL 3
2 NY 8
3 NC 10
4 FL 7
5 NY 11
6 NC 7
7 FL 7
8 NY 9
9 NC 3
10 FL 3
# ℹ 50 more rows
# Summarise the long table: Location is character, Depression_score is numeric.
summary(dep_long)
Location Depression_score
Length:60 Min. : 2.000
Class :character 1st Qu.: 5.000
Mode :character Median : 7.000
Mean : 6.867
3rd Qu.: 8.000
Max. :13.000
# One-way ANOVA: does mean depression score differ by location?
# Location is stored as character, so it is converted to a factor in the formula.
anova_med1 <- aov(Depression_score ~ as.factor(Location), data = dep_long)
summary(anova_med1)
Df Sum Sq Mean Sq F value Pr(>F)
as.factor(Location) 2 61.0 30.517 5.241 0.00814 **
Residuals 57 331.9 5.823
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One box per location, coloured by location, for the first study's scores.
ggplot(
  data = dep_long,
  mapping = aes(Location, Depression_score, fill = Location)
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Locations",
    x = "Location",
    y = "Depression_score"
  ) +
  theme_minimal()
# Post-hoc pairwise comparison of the location means, plotted as
# confidence intervals for each pair of locations.
tukey_test_med1 <- TukeyHSD(anova_med1)
plot(tukey_test_med1)
# One-Way ANOVA ## Pivot the table so that the depression score of those with health conditions is the value and location is the factor.
# Import the second study's spreadsheet (subjects with chronic health
# conditions) and report the class of each column.
dep2 <- read_excel("medical2.xlsx")
# vapply() always returns a character vector, whatever the input looks like.
vapply(dep2, function(col) class(col)[1], character(1))
Florida New York North Carolina
"numeric" "numeric" "numeric"
# Show dep2 at this point, before its columns are renamed further down.
print(dep2)
# A tibble: 20 × 3
Florida `New York` `North Carolina`
<dbl> <dbl> <dbl>
1 13 14 10
2 12 9 12
3 17 15 15
4 17 12 18
5 20 16 12
6 21 24 14
7 16 18 17
8 14 14 8
9 13 15 14
10 17 17 16
11 12 20 18
12 9 11 17
13 12 23 19
14 15 19 15
15 16 17 13
16 15 14 14
17 13 9 11
18 10 14 12
19 11 13 13
20 17 11 11
# Descriptive statistics (min, quartiles, median, mean, max) per state column.
summary(dep2)
Florida New York North Carolina
Min. : 9.0 Min. : 9.00 Min. : 8.00
1st Qu.:12.0 1st Qu.:12.75 1st Qu.:12.00
Median :14.5 Median :14.50 Median :14.00
Mean :14.5 Mean :15.25 Mean :13.95
3rd Qu.:17.0 3rd Qu.:17.25 3rd Qu.:16.25
Max. :21.0 Max. :24.00 Max. :19.00
# Replace the full state names with their two-letter abbreviations, then show
# the renamed table. A name that is not present is simply left alone.
names(dep2)[names(dep2) == "Florida"] <- "FL"
names(dep2)[names(dep2) == "New York"] <- "NY"
names(dep2)[names(dep2) == "North Carolina"] <- "NC"
print(dep2)
# A tibble: 20 × 3
FL NY NC
<dbl> <dbl> <dbl>
1 13 14 10
2 12 9 12
3 17 15 15
4 17 12 18
5 20 16 12
6 21 24 14
7 16 18 17
8 14 14 8
9 13 15 14
10 17 17 16
11 12 20 18
12 9 11 17
13 12 23 19
14 15 19 15
15 16 17 13
16 15 14 14
17 13 9 11
18 10 14 12
19 11 13 13
20 17 11 11
# Rebuild the second study's scores as a plain data frame: one column per
# state, one row per subject (20 subjects per state).
# The values mirror the medical2.xlsx printout; FL subject 7 is 16 there, which
# is also what the reported Florida mean of 14.5 requires.
dep2 <- data.frame(
  FL = c(13, 12, 17, 17, 20, 21, 16, 14, 13, 17,
         12, 9, 12, 15, 16, 15, 13, 10, 11, 17),
  NY = c(14, 9, 15, 12, 16, 24, 18, 14, 15, 17,
         20, 11, 23, 19, 17, 14, 9, 14, 13, 11),
  NC = c(10, 12, 15, 18, 12, 14, 17, 8, 14, 16,
         18, 17, 19, 15, 13, 14, 11, 12, 13, 11)
)
# Reshape to long form: one row per (subject, state) pair, with the state in
# `Location2` and the score in `Depression_score_hc`.
dep_long2 <- pivot_longer(
  dep2,
  cols = everything(),
  names_to = "Location2",
  values_to = "Depression_score_hc"
)
print(dep_long2)
# A tibble: 60 × 2
Location2 Depression_score_hc
<chr> <dbl>
1 FL 13
2 NY 14
3 NC 10
4 FL 12
5 NY 9
6 NC 12
7 FL 17
8 NY 15
9 NC 15
10 FL 17
# ℹ 50 more rows
# Summarise the long table: Location2 is character, Depression_score_hc is numeric.
summary(dep_long2)
Location2 Depression_score_hc
Length:60 Min. : 8.00
Class :character 1st Qu.:12.00
Mode :character Median :14.00
Mean :14.55
3rd Qu.:17.00
Max. :24.00
# One-way ANOVA for the chronic-condition study: does mean depression score
# differ by location? Location2 is character, so it is converted to a factor.
anova_med2 <- aov(Depression_score_hc ~ as.factor(Location2), data = dep_long2)
summary(anova_med2)
Df Sum Sq Mean Sq F value Pr(>F)
as.factor(Location2) 2 17.2 8.60 0.723 0.49
Residuals 57 677.6 11.89
# One box per location, coloured by location, for the chronic-condition scores.
ggplot(
  data = dep_long2,
  mapping = aes(Location2, Depression_score_hc, fill = Location2)
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Locations",
    x = "Location2",
    y = "Depression_score_hc"
  ) +
  theme_minimal()
# Post-hoc pairwise comparison of the location means for the
# chronic-condition study, plotted as confidence intervals per pair.
tukey_test_med2 <- TukeyHSD(anova_med2)
plot(tukey_test_med2)
- Use descriptive statistics to summarize the data from the two studies. What are your preliminary observations about the depression scores? Provided the p-value is not rounded (p = 0.00814 < 0.01), the mean depression scores of the three locations are not all equal. This implies that at least one location has a significantly different mean depression score. The post-hoc Tukey test indicates that the significant difference in mean depression score is between New York and Florida. However, when comparing the depression scores of those who have chronic health conditions, location is not statistically significant because p = 0.49 > 0.01. The post-hoc Tukey test supports this, because all of its confidence intervals contain zero. What is not clear from this study is whether the subjects are full-time or part-time residents of each location.
# Two-Way ANOVA ## Now we combine the two tables, with the depression scores in the same order as the previous table.
# Chronic-condition scores, entered one state per column (20 subjects each).
# FL subject 7 is 16, matching the medical2.xlsx printout.
hc_by_state <- matrix(
  c(
    # FL
    13, 12, 17, 17, 20, 21, 16, 14, 13, 17,
    12, 9, 12, 15, 16, 15, 13, 10, 11, 17,
    # NY
    14, 9, 15, 12, 16, 24, 18, 14, 15, 17,
    20, 11, 23, 19, 17, 14, 9, 14, 13, 11,
    # NC
    10, 12, 15, 18, 12, 14, 17, 8, 14, 16,
    18, 17, 19, 15, 13, 14, 11, 12, 13, 11
  ),
  ncol = 3,
  dimnames = list(NULL, c("FL", "NY", "NC"))
)

# The long table lists each subject as FL, NY, NC in turn, so the scores must
# be interleaved the same way. Flattening the transposed matrix yields
# FL1, NY1, NC1, FL2, NY2, NC2, ... instead of all FL, then all NY, then all NC,
# which would attach most scores to the wrong location.
Depression_score_hc <- as.vector(t(hc_by_state))
# Attach the chronic-condition scores to the long table as a new column.
# The guard stops a vector of the wrong length from being attached.
stopifnot(length(Depression_score_hc) == nrow(dep_long))
dep_long$Depression_score_hc <- Depression_score_hc
print(dep_long)
# A tibble: 60 × 3
Location Depression_score Depression_score_hc
<chr> <dbl> <dbl>
1 FL 3 13
2 NY 8 12
3 NC 10 17
4 FL 7 17
5 NY 11 20
6 NC 7 21
7 FL 7 15
8 NY 9 14
9 NC 3 13
10 FL 3 17
# ℹ 50 more rows
# ANOVA of the summed score (chronic-condition score + healthy score) by
# location.
# NOTE(review): despite the name, this is a one-factor model on a summed
# response, not a two-way ANOVA. A two-way design would stack both studies into
# one score column with a health-status factor and fit something like
# score ~ Location * Health. Confirm which analysis is intended.
two_way <- aov(
  Depression_score_hc + Depression_score ~ as.factor(Location),
  data = dep_long
)
summary(two_way)
Df Sum Sq Mean Sq F value Pr(>F)
as.factor(Location) 2 130.4 65.22 3.513 0.0364 *
Residuals 57 1058.1 18.56
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One box per location, coloured by location, for the sum of the two scores.
ggplot(
  data = dep_long,
  mapping = aes(
    Location,
    Depression_score_hc + Depression_score,
    fill = Location
  )
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Locations",
    x = "Location",
    y = "Depression_score_combined"
  ) +
  theme_minimal()
# Post-hoc pairwise comparison of the location means for the combined-score
# model, plotted as confidence intervals per pair.
tukey_test_combined <- TukeyHSD(two_way)
plot(tukey_test_combined)
Use analysis of variance on both data sets. State the hypotheses being tested in each case. What are your conclusions? There is no significant difference in the mean depression scores based solely on location for individuals with chronic health conditions. However, with the two depression scores combined, there is still a significant difference between the locations because p = 0.036 < 0.05. This is further supported by the post-hoc Tukey test, whose confidence interval shows a significant difference in the combined depression scores between New York and Florida. H0: The mean depression scores of the three selected locations are not significantly different. H1: The mean depression scores (good health, and good and poor health combined) are significantly different between the three selected locations. Post-hoc Tukey tests indicate that the significant difference is between the New York and Florida locations.
Use inferences about individual treatment means where appropriate. What are your conclusions? I am not sure that my two-way ANOVA is correct. I feel that my comparison is off, but when I tried putting location first, I got an error. I feel I'm overlooking something. However, based on the ANOVA analysis, there is a significant enough difference between the depression scores in Florida and New York to warrant further investigation.