Assignment 2

Author

Alison Band

Published

October 11, 2024

install.packages(c(“ggplot2”,“ggpubr”,“tidyverse”,“broom”, “tinytex”, “xlsx”, “readxl”, “quarto”, “rmarkdown”)) tinytex::install_tinytex()

r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)

library(ggplot2)
library(ggpubr)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom)
library(xlsx)
library(readxl)
library(tinytex)
library(quarto)

dep1 <- read_excel("medical1-2.xlsx")
sapply(dep1,class)

       Florida       New York North Carolina 
     "numeric"      "numeric"      "numeric"

colnames(dep1)[colnames(dep1) == 'New York'] <- 'NY'
colnames(dep1)[colnames(dep1) == 'North Carolina'] <- 'NC'
colnames(dep1)[colnames(dep1) == 'Florida'] <- 'FL'

print(dep1)

# A tibble: 20 × 3
      FL    NY    NC
   <dbl> <dbl> <dbl>
 1     3     8    10
 2     7    11     7
 3     7     9     3
 4     3     7     5
 5     8     8    11
 6     8     7     8
 7     8     8     4
 8     5     4     3
 9     5    13     7
10     2    10     8
11     6     6     8
12     2     8     7
13     6    12     3
14     6     8     9
15     9     6     8
16     7     8    12
17     5     5     6
18     4     7     3
19     7     7     8
20     3     8    11

summary(dep1)

       FL             NY              NC       
 Min.   :2.00   Min.   : 4.00   Min.   : 3.00  
 1st Qu.:3.75   1st Qu.: 7.00   1st Qu.: 4.75  
 Median :6.00   Median : 8.00   Median : 7.50  
 Mean   :5.55   Mean   : 8.00   Mean   : 7.05  
 3rd Qu.:7.00   3rd Qu.: 8.25   3rd Qu.: 8.25  
 Max.   :9.00   Max.   :13.00   Max.   :12.00

#One-Way Anova ## Pivot table so that Depression Score is the value based on the factor location.

dep1 <- data.frame(
  FL = c(3,7,7,3,8,8,8,5,5,2,6,2,6,6,9,7,5,4,7,3),
  NY = c(8,11,9,7,8,7,8,4,13,10,6,8,12,8,6,8,5,7,7,8),
  NC = c(10,7,3,5,11,8,4,3,7,8,8,7,3,9,8,12,6,3,8,11))

dep_long <- pivot_longer(dep1, cols = everything(), names_to = "Location", values_to = "Depression_score")

print(dep_long)

# A tibble: 60 × 2
   Location Depression_score
   <chr>               <dbl>
 1 FL                      3
 2 NY                      8
 3 NC                     10
 4 FL                      7
 5 NY                     11
 6 NC                      7
 7 FL                      7
 8 NY                      9
 9 NC                      3
10 FL                      3
# ℹ 50 more rows

summary(dep_long)

   Location         Depression_score
 Length:60          Min.   : 2.000  
 Class :character   1st Qu.: 5.000  
 Mode  :character   Median : 7.000  
                    Mean   : 6.867  
                    3rd Qu.: 8.000  
                    Max.   :13.000

anova_med1 <- aov(Depression_score ~ as.factor(Location), data = dep_long)

summary(anova_med1)

                    Df Sum Sq Mean Sq F value  Pr(>F)   
as.factor(Location)  2   61.0  30.517   5.241 0.00814 **
Residuals           57  331.9   5.823                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(dep_long, aes(x = Location, y = Depression_score, fill = Location)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Locations", x = "Location", y = "Depression_score")

tukey_test_med1 <- TukeyHSD(anova_med1)
plot(tukey_test_med1)

#One-Way Anova ##Pivot table so that Depression Score of those with health conditions is the value based on the factor location.

dep2 <- read_excel("medical2.xlsx")
sapply(dep2,class)

       Florida       New York North Carolina 
     "numeric"      "numeric"      "numeric"

print(dep2)

# A tibble: 20 × 3
   Florida `New York` `North Carolina`
     <dbl>      <dbl>            <dbl>
 1      13         14               10
 2      12          9               12
 3      17         15               15
 4      17         12               18
 5      20         16               12
 6      21         24               14
 7      16         18               17
 8      14         14                8
 9      13         15               14
10      17         17               16
11      12         20               18
12       9         11               17
13      12         23               19
14      15         19               15
15      16         17               13
16      15         14               14
17      13          9               11
18      10         14               12
19      11         13               13
20      17         11               11

summary(dep2)

    Florida        New York     North Carolina 
 Min.   : 9.0   Min.   : 9.00   Min.   : 8.00  
 1st Qu.:12.0   1st Qu.:12.75   1st Qu.:12.00  
 Median :14.5   Median :14.50   Median :14.00  
 Mean   :14.5   Mean   :15.25   Mean   :13.95  
 3rd Qu.:17.0   3rd Qu.:17.25   3rd Qu.:16.25  
 Max.   :21.0   Max.   :24.00   Max.   :19.00

colnames(dep2)[colnames(dep2) == 'New York'] <- 'NY'
colnames(dep2)[colnames(dep2) == 'North Carolina'] <- 'NC'
colnames(dep2)[colnames(dep2) == 'Florida'] <- 'FL'

print(dep2)

# A tibble: 20 × 3
      FL    NY    NC
   <dbl> <dbl> <dbl>
 1    13    14    10
 2    12     9    12
 3    17    15    15
 4    17    12    18
 5    20    16    12
 6    21    24    14
 7    16    18    17
 8    14    14     8
 9    13    15    14
10    17    17    16
11    12    20    18
12     9    11    17
13    12    23    19
14    15    19    15
15    16    17    13
16    15    14    14
17    13     9    11
18    10    14    12
19    11    13    13
20    17    11    11

dep2 <- data.frame(
  FL = c(13,12,17,17,20,21,15,14,13,17,12,9,12,15,16,15,13,10,11,17),
  NY = c(14,9,15,12,16,24,18,14,15,17,20,11,23,19,17,14,9,14,13,11),
  NC = c(10,12,15,18,12,14,17,8,14,16,18,17,19,15,13,14,11,12,13,11))

dep_long2 <- pivot_longer(dep2, cols = everything(), names_to = "Location2", values_to = "Depression_score_hc")

print(dep_long2)

# A tibble: 60 × 2
   Location2 Depression_score_hc
   <chr>                   <dbl>
 1 FL                         13
 2 NY                         14
 3 NC                         10
 4 FL                         12
 5 NY                          9
 6 NC                         12
 7 FL                         17
 8 NY                         15
 9 NC                         15
10 FL                         17
# ℹ 50 more rows

summary(dep_long2)

  Location2         Depression_score_hc
 Length:60          Min.   : 8.00      
 Class :character   1st Qu.:12.00      
 Mode  :character   Median :14.00      
                    Mean   :14.55      
                    3rd Qu.:17.00      
                    Max.   :24.00

anova_med2 <- aov(Depression_score_hc ~ as.factor(Location2), data = dep_long2)

summary(anova_med2)

                     Df Sum Sq Mean Sq F value Pr(>F)
as.factor(Location2)  2   17.2    8.60   0.723   0.49
Residuals            57  677.6   11.89

ggplot(dep_long2, aes(x = Location2, y = Depression_score_hc, fill = Location2)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Locations", x = "Location2", y = "Depression_score_hc")

tukey_test_med2 <- TukeyHSD(anova_med2)
plot(tukey_test_med2)

Use descriptive statistics to summarize the data from the two studies. What are your preliminary observations about the depression scores? As long as you don’t round the p value, the mean depression scored between the 3 locations are not equal. This implies that at least one location has a significantly different mean depression score. The post-hoc Tukey test indicates that the mean depression score between New York and Florida. However, with comparing the depression score with those that have chronic health conditions, the location is not statistically significant because p = 0.49 > 0.01. The post-hoc Tukey test further proves that because all the confidence intervals contain zero. What is not clear from this study is if the subjects are full or part-time residents of each location.

#Two-way ANOVA ##Now we are going to combine the two tables with the depression scores in the same order as the previous table.

Depression_score_hc <- c(13,12,17,17,20,21,15,14,13,17,12,9,12,15,16,15,13,10,11,17,14,9,15,12,16,24,18,14,15,17,20,11,23,19,17,14,9,14,13,11,10,12,15,18,12,14,17,8,14,16,18,17,19,15,13,14,11,12,13,11)

dep_long$Depression_score_hc <- Depression_score_hc

print(dep_long)

# A tibble: 60 × 3
   Location Depression_score Depression_score_hc
   <chr>               <dbl>               <dbl>
 1 FL                      3                  13
 2 NY                      8                  12
 3 NC                     10                  17
 4 FL                      7                  17
 5 NY                     11                  20
 6 NC                      7                  21
 7 FL                      7                  15
 8 NY                      9                  14
 9 NC                      3                  13
10 FL                      3                  17
# ℹ 50 more rows

two_way <- aov(Depression_score_hc + Depression_score ~ as.factor(Location), data = dep_long)
summary(two_way)

                    Df Sum Sq Mean Sq F value Pr(>F)  
as.factor(Location)  2  130.4   65.22   3.513 0.0364 *
Residuals           57 1058.1   18.56                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(dep_long, aes(x = Location, y = Depression_score_hc+Depression_score, fill = Location)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplot of Locations", x = "Location", y = "Depression_score_combined")

tukey_test_combined <- TukeyHSD(two_way)
plot(tukey_test_combined)

Use analysis of variance on both data sets. State the hypotheses being tested in each case. What are your conclusions? There is not a significant difference in the means based solely on location for individuals with chronic health conditions. However, with the two depression scores combined, there is still a significant difference between the locations because p = 0.036 < 0.05. This is further proved because the confidence interval from the post-hoc Tukey test shows that there is a significant difference of the combined depressions scores between New York and Florida. H0: The mean depression scores between the three selected locations is not significantly different. H1: The mean depression scores (good health & good and poor health combined) is significantly different between the 3 selected locations. Post-hoc Tukey tests indicate that the significant difference is between the New York and Florida locations.
Use inferences about individual treatment means where appropriate. What are your conclusions? I am not too sure my two way anova is correct. Somehow I feel like my comparison is off, but when I tried putting location first, I got an error.I feel I’m overlooking something. However, based on the anova analysis, there is significant enough difference between depression scores in Florida and New York to warrant further investigation.