Introduction

This report analyzes data from two studies conducted by sociologists and physicians at the Wentworth Medical Center in upstate New York. The studies examine the relationship between geographic location and depression in individuals 65 years of age or older.

Study 1 (Facility = 1): Examined 60 reasonably healthy individuals
Study 2 (Facility = 2): Examined 60 individuals with chronic health conditions (arthritis, hypertension, and/or heart ailments)

Each study included 20 residents from Florida, 20 from New York, and 20 from North Carolina. Depression levels were measured using a standardized test where higher scores indicate higher levels of depression.

Data Preparation

# Read the data from CSV file
# (relative path: the file is looked up in the working directory in effect
# when the document is rendered)
data <- read.csv("two_way_med_smthg.csv")

# Check the structure of the data
# Lines starting with `##` below are the console output captured at render
# time; they are shown for reference and are not executed.
cat("Raw data structure:\n")
## Raw data structure:
str(data)
## 'data.frame':    120 obs. of  3 variables:
##  $ depression: int  13 12 17 17 20 21 16 14 13 17 ...
##  $ location  : chr  "Florida" "Florida" "Florida" "Florida" ...
##  $ facility  : int  2 2 2 2 2 2 2 2 2 2 ...
# List the distinct raw values of both grouping columns before any recoding
cat("\nUnique locations:", unique(data$location), "\n")
## 
## Unique locations: Florida New York North Carolina
cat("Unique facilities:", unique(data$facility), "\n\n")
## Unique facilities: 2 1
# Normalise the location labels (strip surrounding whitespace, turn
# underscores into spaces) and store the cleaned text as a factor
data[["location"]] <- factor(gsub("_", " ", trimws(data[["location"]])))

# Recode the numeric facility codes as a labelled factor:
# 1 -> "Healthy", 2 -> "Chronic"
data[["facility"]] <- factor(
  data[["facility"]],
  levels = c(1, 2),
  labels = c("Healthy", "Chronic")
)

# One data frame per study, selected by facility level
study1_healthy <- filter(data, facility == "Healthy")
study2_chronic <- filter(data, facility == "Chronic")

# Rebuild the location factor inside each subset. Calling factor() on an
# existing factor keeps only the levels that still occur in that subset,
# so any level left without rows after filtering is dropped.
study1_healthy[["location"]] <- factor(study1_healthy[["location"]])
study2_chronic[["location"]] <- factor(study2_chronic[["location"]])

# Verify the data after filtering
# For each study, print the location levels present and the number of rows
# per location (`##` lines are the output captured at render time).
cat("\nStudy 1 (Healthy) - Locations present:", levels(study1_healthy$location), "\n")
## 
## Study 1 (Healthy) - Locations present: Florida New York North Carolina
cat("Study 1 sample sizes by location:\n")
## Study 1 sample sizes by location:
print(table(study1_healthy$location))
## 
##        Florida       New York North Carolina 
##             20             20             20
# Same checks for the chronic-condition subset
cat("\nStudy 2 (Chronic) - Locations present:", levels(study2_chronic$location), "\n")
## 
## Study 2 (Chronic) - Locations present: Florida New York North Carolina
cat("Study 2 sample sizes by location:\n")
## Study 2 sample sizes by location:
print(table(study2_chronic$location))
## 
##        Florida       New York North Carolina 
##             20             20             20
# Display data structure
# Row count and column names of the combined (both studies) data frame
cat("Dataset Structure:\n")
## Dataset Structure:
cat("Total observations:", nrow(data), "\n")
## Total observations: 120
cat("Variables:", names(data), "\n\n")
## Variables: depression location facility
# Summary of data distribution
# Cross-tabulate facility (rows) by location (columns) to show the cell sizes
table_summary <- table(data$facility, data$location)
kable(table_summary, 
      caption = "Sample Size Distribution")
Sample Size Distribution
Florida New York North Carolina
Healthy 20 20 20
Chronic 20 20 20

Question 1: Descriptive Statistics

Question: Use descriptive statistics to summarize the data from the two studies. What are your preliminary observations about the depression scores?

Answer to Question 1

Descriptive Statistics for Study 1 (Healthy Individuals)

# Per-location summary of the Study 1 depression scores: sample size, mean,
# SD, extremes, quartiles and range (mean, SD and quartiles rounded to 2 dp)
desc_healthy <- summarise(
  group_by(study1_healthy, location),
  N = n(),
  Mean = round(mean(depression), digits = 2),
  SD = round(sd(depression), digits = 2),
  Min = min(depression),
  Q1 = round(quantile(depression, probs = 0.25), digits = 2),
  Median = median(depression),
  Q3 = round(quantile(depression, probs = 0.75), digits = 2),
  Max = max(depression),
  Range = Max - Min
)

kable(
  desc_healthy,
  caption = "Descriptive Statistics: Study 1 (Healthy Individuals, Facility = 1)"
)
Descriptive Statistics: Study 1 (Healthy Individuals, Facility = 1)
location N Mean SD Min Q1 Median Q3 Max Range
Florida 20 5.55 2.14 2 3.75 6.0 7.00 9 7
New York 20 8.00 2.20 4 7.00 8.0 8.25 13 9
North Carolina 20 7.05 2.84 3 4.75 7.5 8.25 12 9
# Pooled Study 1 summary across all locations (mean and SD rounded to 2 dp)
overall_healthy <- summarise(
  study1_healthy,
  N = n(),
  Mean = round(mean(depression), digits = 2),
  SD = round(sd(depression), digits = 2),
  Min = min(depression),
  Median = median(depression),
  Max = max(depression)
)

# Blank line in the output before the table
cat("\n")
kable(overall_healthy, caption = "Overall Statistics for Study 1")
Overall Statistics for Study 1
N Mean SD Min Median Max
60 6.87 2.58 2 7 13

Descriptive Statistics for Study 2 (Chronic Conditions)

# Per-location summary of the Study 2 depression scores: sample size, mean,
# SD, extremes, quartiles and range (mean, SD and quartiles rounded to 2 dp)
desc_chronic <- summarise(
  group_by(study2_chronic, location),
  N = n(),
  Mean = round(mean(depression), digits = 2),
  SD = round(sd(depression), digits = 2),
  Min = min(depression),
  Q1 = round(quantile(depression, probs = 0.25), digits = 2),
  Median = median(depression),
  Q3 = round(quantile(depression, probs = 0.75), digits = 2),
  Max = max(depression),
  Range = Max - Min
)

kable(
  desc_chronic,
  caption = "Descriptive Statistics: Study 2 (Chronic Conditions, Facility = 2)"
)
Descriptive Statistics: Study 2 (Chronic Conditions, Facility = 2)
location N Mean SD Min Q1 Median Q3 Max Range
Florida 20 14.50 3.17 9 12.00 14.5 17.00 21 12
New York 20 15.25 4.13 9 12.75 14.5 17.25 24 15
North Carolina 20 13.95 2.95 8 12.00 14.0 16.25 19 11
# Pooled Study 2 summary across all locations (mean and SD rounded to 2 dp)
overall_chronic <- summarise(
  study2_chronic,
  N = n(),
  Mean = round(mean(depression), digits = 2),
  SD = round(sd(depression), digits = 2),
  Min = min(depression),
  Median = median(depression),
  Max = max(depression)
)

# Blank line in the output before the table
cat("\n")
kable(overall_chronic, caption = "Overall Statistics for Study 2")
Overall Statistics for Study 2
N Mean SD Min Median Max
60 14.57 3.44 8 14 24

Comparative Summary Table

# Mean and SD of depression for every facility-by-location cell, reshaped so
# each location is one row and each facility contributes a
# <facility>_Mean and <facility>_SD column
comparison <- pivot_wider(
  summarise(
    group_by(data, facility, location),
    Mean = round(mean(depression), digits = 2),
    SD = round(sd(depression), digits = 2),
    .groups = "drop"
  ),
  names_from = facility,
  values_from = c(Mean, SD),
  names_glue = "{facility}_{.value}"
)

kable(
  comparison,
  caption = "Comparison of Depression Scores by Health Status and Location"
)
Comparison of Depression Scores by Health Status and Location
location Healthy_Mean Chronic_Mean Healthy_SD Chronic_SD
Florida 5.55 14.50 2.14 3.17
New York 8.00 15.25 2.20 4.13
North Carolina 7.05 13.95 2.84 2.95

Visualizations

# Side-by-side box plots of depression score by location, one panel per
# study, drawn on a common y-axis (0-25) so the panels are directly
# comparable. par() returns the settings it replaces; they are saved here and
# restored at the end so that neither the layout nor the margins leak into
# later plots.
old_par <- par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))

# Study 1 boxplot (Healthy)
boxplot(depression ~ location, data = study1_healthy,
        main = "Study 1: Healthy Individuals\n(Facility = 1)",
        xlab = "Geographic Location",
        ylab = "Depression Score",
        col = c("lightblue", "lightgreen", "lightcoral"),
        las = 2,
        cex.axis = 0.8,
        ylim = c(0, 25))

# Overlay the group means on the Study 1 panel. tapply() returns one mean per
# factor level, in the same level order boxplot() uses for the boxes, so the
# x positions are simply 1..number of groups.
means_healthy <- tapply(study1_healthy$depression, study1_healthy$location, mean)
points(seq_along(means_healthy), means_healthy, pch = 19, col = "red", cex = 1.5)
legend("topleft", legend = c("Mean"), pch = 19, col = "red", cex = 0.8, bty = "n")

# Study 2 boxplot (Chronic)
boxplot(depression ~ location, data = study2_chronic,
        main = "Study 2: Chronic Conditions\n(Facility = 2)",
        xlab = "Geographic Location",
        ylab = "Depression Score",
        col = c("lightblue", "lightgreen", "lightcoral"),
        las = 2,
        cex.axis = 0.8,
        ylim = c(0, 25))

# Overlay the group means on the Study 2 panel (same approach as above)
means_chronic <- tapply(study2_chronic$depression, study2_chronic$location, mean)
points(seq_along(means_chronic), means_chronic, pch = 19, col = "red", cex = 1.5)
legend("topleft", legend = c("Mean"), pch = 19, col = "red", cex = 0.8, bty = "n")

# Restore the graphics parameters that were active before this chunk
par(old_par)

Preliminary Observations

Based on the descriptive statistics and visualizations, I observe the following key patterns:

1. Substantial Health Status Effect

Individuals with chronic health conditions (Study 2, Facility = 2) exhibit considerably higher depression scores (Mean = 14.57, SD = 3.44) compared to reasonably healthy individuals (Study 1, Facility = 1: Mean = 6.87, SD = 2.58). This represents an increase of 7.70 points, or approximately 112% higher depression scores in the chronic condition group. This dramatic difference suggests that chronic health conditions are a major determinant of depression in elderly populations.

2. Geographic Patterns in Healthy Population (Study 1)

Among reasonably healthy individuals, Florida residents show the lowest mean depression score (Mean = 5.55, SD = 2.14), while New York residents show the highest (Mean = 8.00, SD = 2.20). North Carolina residents fall in between (Mean = 7.05, SD = 2.84). The difference between Florida and New York is 2.45 points.

3. Geographic Patterns in Chronic Condition Population (Study 2)

The geographic pattern is less pronounced among individuals with chronic conditions. New York still shows slightly higher depression (Mean = 15.25, SD = 4.13), North Carolina shows the lowest (Mean = 13.95, SD = 2.95), and Florida is intermediate (Mean = 14.50, SD = 3.17).

4. Variability Considerations

Variability is not consistently tied to one location. In Study 1, North Carolina has the largest standard deviation (2.84), while New York (2.20) and Florida (2.14) are similar; in Study 2, New York is the most variable (4.13), followed by Florida (3.17) and North Carolina (2.95). The chronic condition group shows greater overall variability (SD = 3.44) compared to the healthy group (SD = 2.58).

5. Distribution Characteristics

Study 1 depression scores range from 2 to 13, while Study 2 scores range from 8 to 24. There is minimal overlap between distributions, suggesting chronic conditions substantially elevate baseline depression levels.

Question 2: Analysis of Variance

Question: Use analysis of variance on both data sets. State the hypotheses being tested in each case. What are your conclusions?

Answer to Question 2

ANOVA for Study 1 (Healthy Individuals)

Hypotheses

For Study 1 (Facility = 1, reasonably healthy individuals):

Null Hypothesis (H0): mu_Florida = mu_New York = mu_North Carolina

The mean depression scores are equal across all three geographic locations.

Alternative Hypothesis (Ha): At least one mu_i does not equal mu_j

At least one location has a different mean depression score.

Significance Level: alpha = 0.05

ANOVA Results

# Perform one-way ANOVA for Study 1
# Model: depression score explained by location, healthy subset only
anova_model_1 <- aov(depression ~ location, data = study1_healthy)
anova_summary_1 <- summary(anova_model_1)
print(anova_summary_1)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## location     2   61.0  30.517   5.241 0.00814 **
## Residuals   57  331.9   5.823                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size
# eta-squared = SS(location) / SS(total). Element [[1]] of the summary is the
# ANOVA table; its first `Sum Sq` entry is the location term and the sum over
# all entries (location + residuals) is the total sum of squares.
ss_between_1 <- anova_summary_1[[1]]$`Sum Sq`[1]
ss_total_1 <- sum(anova_summary_1[[1]]$`Sum Sq`)
eta_squared_1 <- ss_between_1 / ss_total_1

cat("\nEffect Size (eta-squared):", round(eta_squared_1, 4), "\n")
## 
## Effect Size (eta-squared): 0.1553
cat("Percentage of variance explained:", round(eta_squared_1 * 100, 2), "%\n")
## Percentage of variance explained: 15.53 %
# Tabular version of the same model for display, with every numeric column
# rounded to 4 decimal places.
# NOTE(review): tidy() is presumably broom::tidy; the library() calls are not
# visible in this part of the document - confirm.
anova_table_1 <- tidy(anova_model_1) %>%
  mutate(across(where(is.numeric), ~round(., 4)))

kable(anova_table_1, 
      caption = "ANOVA Table for Study 1 (Healthy Individuals)")
ANOVA Table for Study 1 (Healthy Individuals)
term df sumsq meansq statistic p.value
location 2 61.0333 30.5167 5.2409 0.0081
Residuals 57 331.9000 5.8228 NA NA

Conclusion for Study 1

With F(2, 57) = 5.24 and p-value = 0.0081 < 0.05, we reject the null hypothesis. There is statistically significant evidence that geographic location affects depression levels in reasonably healthy elderly individuals. The effect size (eta-squared = 0.155) indicates approximately 15.5% of variance is explained by location, representing a medium to large effect.

ANOVA for Study 2 (Chronic Conditions)

Hypotheses

For Study 2 (Facility = 2, individuals with chronic health conditions):

Null Hypothesis (H0): mu_Florida = mu_New York = mu_North Carolina

Alternative Hypothesis (Ha): At least one mu_i does not equal mu_j

Significance Level: alpha = 0.05

ANOVA Results

# Perform one-way ANOVA for Study 2
# Model: depression score explained by location, chronic-condition subset only
anova_model_2 <- aov(depression ~ location, data = study2_chronic)
anova_summary_2 <- summary(anova_model_2)
print(anova_summary_2)
##             Df Sum Sq Mean Sq F value Pr(>F)
## location     2   17.0   8.517   0.714  0.494
## Residuals   57  679.7  11.925
# Calculate effect size
# eta-squared = SS(location) / SS(total). Element [[1]] of the summary is the
# ANOVA table; its first `Sum Sq` entry is the location term and the sum over
# all entries (location + residuals) is the total sum of squares.
ss_between_2 <- anova_summary_2[[1]]$`Sum Sq`[1]
ss_total_2 <- sum(anova_summary_2[[1]]$`Sum Sq`)
eta_squared_2 <- ss_between_2 / ss_total_2

cat("\nEffect Size (eta-squared):", round(eta_squared_2, 4), "\n")
## 
## Effect Size (eta-squared): 0.0244
cat("Percentage of variance explained:", round(eta_squared_2 * 100, 2), "%\n")
## Percentage of variance explained: 2.44 %
# Tabular version of the same model for display, with every numeric column
# rounded to 4 decimal places.
# NOTE(review): tidy() is presumably broom::tidy; the library() calls are not
# visible in this part of the document - confirm.
anova_table_2 <- tidy(anova_model_2) %>%
  mutate(across(where(is.numeric), ~round(., 4)))

kable(anova_table_2, 
      caption = "ANOVA Table for Study 2 (Chronic Conditions)")
ANOVA Table for Study 2 (Chronic Conditions)
term df sumsq meansq statistic p.value
location 2 17.0333 8.5167 0.7142 0.4939
Residuals 57 679.7000 11.9246 NA NA

Conclusion for Study 2

With F(2, 57) = 0.71 and p-value = 0.494 > 0.05, we fail to reject the null hypothesis. There is insufficient evidence that geographic location affects depression in elderly individuals with chronic conditions. The effect size (eta-squared = 0.024) is negligible, indicating only 2.4% of variance is explained by location.

Comparative Summary

# Side-by-side comparison of the two one-way ANOVAs.
# The F statistic, p-value and eta-squared are read from the fitted model
# summaries rather than typed in by hand, so this table cannot drift from the
# ANOVA output reported for each study. Element [[1]] of each summary is the
# ANOVA table and its first row is the `location` term.
anova_comparison <- data.frame(
  Study = c("Study 1 (Healthy)", "Study 2 (Chronic)"),
  F_statistic = c(anova_summary_1[[1]]$`F value`[1],
                  anova_summary_2[[1]]$`F value`[1]),
  p_value = c(anova_summary_1[[1]]$`Pr(>F)`[1],
              anova_summary_2[[1]]$`Pr(>F)`[1]),
  Eta_squared = c(eta_squared_1, eta_squared_2)
)

# Decision at the alpha = 0.05 level stated with the hypotheses
anova_comparison$Decision <- ifelse(
  anova_comparison$p_value < 0.05, "Reject H0", "Fail to reject H0"
)

kable(anova_comparison, caption = "Comparison of ANOVA Results", digits = 4)
Comparison of ANOVA Results
Study F_statistic p_value Eta_squared Decision
Study 1 (Healthy) 5.2409 0.0081 0.1553 Reject H0
Study 2 (Chronic) 0.7142 0.4939 0.0244 Fail to reject H0

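The contrasting results suggest that health status may moderate the effect of geographic location on depression: location significantly affects healthy individuals but not those with chronic conditions. Because the two studies were analyzed with separate one-way ANOVAs, this moderation is not tested directly; a formal test would require the location-by-health-status interaction in a two-way ANOVA.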

Question 3: Post-Hoc Comparisons

Question: Use inferences about individual treatment means where appropriate. What are your conclusions?

Answer to Question 3

Post-Hoc Analysis for Study 1 (Healthy Individuals)

Since the ANOVA for Study 1 was significant (p = 0.0081), post-hoc comparisons are warranted. I will use Tukey’s HSD test to control the family-wise error rate.

# Perform Tukey's HSD test
# All pairwise location comparisons for the Study 1 model, with 95%
# family-wise confidence intervals
tukey_1 <- TukeyHSD(anova_model_1, conf.level = 0.95)
print(tukey_1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = depression ~ location, data = study1_healthy)
## 
## $location
##                          diff        lwr       upr     p adj
## New York-Florida         2.45  0.6137268 4.2862732 0.0060757
## North Carolina-Florida   1.50 -0.3362732 3.3362732 0.1300689
## North Carolina-New York -0.95 -2.7862732 0.8862732 0.4320172
# Convert the $location result matrix to a data frame for display: the row
# names (the pair labels) become a `Comparison` column, a Yes/No flag marks
# adjusted p-values below 0.05, and numeric columns are rounded to 4 dp.
# The flag is computed before the rounding step, so it is based on the
# unrounded p-value.
# NOTE(review): rownames_to_column() is presumably tibble's - the library()
# calls are not visible in this part of the document; confirm.
tukey_df_1 <- as.data.frame(tukey_1$location) %>%
  rownames_to_column("Comparison") %>%
  mutate(
    Significant = ifelse(`p adj` < 0.05, "Yes", "No"),
    across(where(is.numeric), ~round(., 4))
  )

kable(tukey_df_1, 
      caption = "Tukey HSD Post-Hoc Comparisons for Study 1",
      col.names = c("Comparison", "Mean Diff", "Lower CI", 
                    "Upper CI", "Adj p-value", "Significant?"))
Tukey HSD Post-Hoc Comparisons for Study 1
Comparison Mean Diff Lower CI Upper CI Adj p-value Significant?
New York-Florida 2.45 0.6137 4.2863 0.0061 Yes
North Carolina-Florida 1.50 -0.3363 3.3363 0.1301 No
North Carolina-New York -0.95 -2.7863 0.8863 0.4320 No
# Plot the Study 1 Tukey intervals, add a title, and mark zero (no
# difference between locations) with a dashed red vertical line
plot(tukey_1, las = 1, cex.axis = 0.8)
title(main = "95% Confidence Intervals\nStudy 1 (Healthy)")
abline(v = 0, col = "red", lty = 2)

Interpretation for Study 1

1. New York vs. Florida (p = 0.006): SIGNIFICANT
Healthy elderly in New York have significantly higher depression than those in Florida (difference = 2.45, 95% CI: 0.61 to 4.29). This is the only significant pairwise difference.

2. North Carolina vs. Florida (p = 0.130): NOT SIGNIFICANT
While North Carolina shows higher depression than Florida (difference = 1.50), this does not reach statistical significance.

3. North Carolina vs. New York (p = 0.432): NOT SIGNIFICANT
No significant difference between these locations (difference = -0.95).

Conclusion: Among healthy elderly, Florida residents experience significantly lower depression than New York residents. This suggests environmental, climate, or lifestyle factors in Florida may be protective.

Post-Hoc Analysis for Study 2 (Chronic Conditions)

Although the ANOVA was not significant for Study 2, examining pairwise comparisons provides descriptive information.

# All pairwise location comparisons for the Study 2 model, with 95%
# family-wise confidence intervals
tukey_2 <- TukeyHSD(anova_model_2, conf.level = 0.95)
print(tukey_2)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = depression ~ location, data = study2_chronic)
## 
## $location
##                          diff     lwr    upr     p adj
## New York-Florida         0.75 -1.8778 3.3778 0.7721094
## North Carolina-Florida  -0.55 -3.1778 2.0778 0.8698440
## North Carolina-New York -1.30 -3.9278 1.3278 0.4636641
# Convert the $location result matrix to a data frame for display: the row
# names (the pair labels) become a `Comparison` column, a Yes/No flag marks
# adjusted p-values below 0.05, and numeric columns are rounded to 4 dp.
# The flag is computed before the rounding step, so it is based on the
# unrounded p-value.
# NOTE(review): rownames_to_column() is presumably tibble's - the library()
# calls are not visible in this part of the document; confirm.
tukey_df_2 <- as.data.frame(tukey_2$location) %>%
  rownames_to_column("Comparison") %>%
  mutate(
    Significant = ifelse(`p adj` < 0.05, "Yes", "No"),
    across(where(is.numeric), ~round(., 4))
  )

kable(tukey_df_2, 
      caption = "Tukey HSD Post-Hoc Comparisons for Study 2",
      col.names = c("Comparison", "Mean Diff", "Lower CI", 
                    "Upper CI", "Adj p-value", "Significant?"))
Tukey HSD Post-Hoc Comparisons for Study 2
Comparison Mean Diff Lower CI Upper CI Adj p-value Significant?
New York-Florida 0.75 -1.8778 3.3778 0.7721 No
North Carolina-Florida -0.55 -3.1778 2.0778 0.8698 No
North Carolina-New York -1.30 -3.9278 1.3278 0.4637 No
# Plot the Study 2 Tukey intervals, add a title, and mark zero (no
# difference between locations) with a dashed red vertical line
plot(tukey_2, las = 1, cex.axis = 0.8)
title(main = "95% Confidence Intervals\nStudy 2 (Chronic)")
abline(v = 0, col = "red", lty = 2)

Interpretation for Study 2

No pairwise comparisons reached statistical significance, consistent with the ANOVA results:

  • New York vs. Florida: p = 0.772 (not significant)
  • North Carolina vs. Florida: p = 0.870 (not significant)
  • North Carolina vs. New York: p = 0.464 (not significant)

All confidence intervals include zero, indicating no reliable differences across locations for individuals with chronic conditions.

Conclusion: For elderly with chronic conditions, location does not significantly influence depression. The burden of chronic illness dominates, overshadowing location-based effects.