# Load necessary libraries
library(ggplot2)  # For QQ plot
library(car)      # For Levene's test
## Loading required package: carData
library(stats)    # For Shapiro-Wilk test
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(rstatix) # For normality tests
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(dunn.test)
library(ggpubr)
Question 4
# Read the CSV file
df <- read.csv('/Users/rafael/Desktop/UCSB_Grad/BIOE 212/Homework/HW5_Sample_Sheet _Group_two.csv')  # Replace with actual file path
# Ensure treatment column is a factor
df$Treatment <- as.factor(df$Treatment)

# Function to compute residuals for a single dataset (one-sample model)
compute_residuals <- function(y) {
  x <- seq_along(y)  # Index values (1, 2, 3, ...)
  model <- lm(y ~ x)  # Fit linear model
  predicted <- predict(model)  # Compute predicted values
  return(y - predicted)  # Compute residuals manually
}

# Function to compute variance for a single dataset
compute_single_group_variance <- function(y) {
  return(var(y))  # Compute variance of the dataset
}

# Function to perform Levene's test for homogeneity of variances
perform_levene_test <- function(df, value_col, group_col) {
  long_df <- data.frame(values = df[[value_col]], group = df[[group_col]])  # Convert to long format
  levene_test <- leveneTest(values ~ group, data = long_df)  # Levene's test
  
  print("Levene's Test for Equal Variances:")
  print(levene_test)
  
  return(levene_test)
}

# Function to plot QQ plot for residuals
plot_qq_residuals <- function(residuals, title) {
  qq_plot <- ggplot(data.frame(residuals = residuals), aes(sample = residuals)) +
    stat_qq() + 
    stat_qq_line() + 
    ggtitle(title) +
    theme_minimal()
  
  print(qq_plot)
}

# Function to plot homoscedasticity check (Residuals vs Fitted Values)
plot_homoscedasticity <- function(anova_model, title) {
  residuals_anova <- residuals(anova_model)
  fitted_values <- fitted(anova_model)
  
  homoscedasticity_plot <- ggplot(data.frame(Fitted = fitted_values, Residuals = residuals_anova),
                                  aes(x = Fitted, y = Residuals)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(title) +
    theme_minimal()
  
  print(homoscedasticity_plot)
}
# Compute residuals for each treatment group
residuals_list <- lapply(split(df$`nuclear_cytoplasmic_brachyury_ratio`, df$Treatment), compute_residuals)

# Perform Shapiro-Wilk test for each treatment's residuals
shapiro_results <- lapply(residuals_list, shapiro.test)
print(shapiro_results)
## $Treatment_A
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.96523, p-value = 0.6526
## 
## 
## $Treatment_B
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.97855, p-value = 0.9569
## 
## 
## $Treatment_D
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.97749, p-value = 0.7557
## 
## 
## $Treatment_F
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.92199, p-value = 0.03022
## 
## 
## $Treatment_H
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.95669, p-value = 0.48
# Compute variance for each treatment group
variances <- sapply(split(df$`nuclear_cytoplasmic_brachyury_ratio`, df$Treatment), compute_single_group_variance)
print(variances)
## Treatment_A Treatment_B Treatment_D Treatment_F Treatment_H 
## 0.005368700 0.009660559 0.027971679 0.027720539 0.005805254
# Perform Levene's test for homogeneity of variances

levene_test <- perform_levene_test(df, "nuclear_cytoplasmic_brachyury_ratio", "Treatment")
## [1] "Levene's Test for Equal Variances:"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   4  3.0986 0.01869 *
##       105                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform one-way ANOVA
anova_model <- aov(`nuclear_cytoplasmic_brachyury_ratio` ~ Treatment, data = df)
anova_result <- summary(anova_model)
print(anova_result)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Treatment     4 0.8976 0.22439   12.31 3.04e-08 ***
## Residuals   105 1.9143 0.01823                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If ANOVA is significant, perform Tukey's post-hoc test
if (anova_result[[1]]$"Pr(>F)"[1] < 0.05) {
  posthoc <- TukeyHSD(anova_model)
  print(posthoc)
}
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = nuclear_cytoplasmic_brachyury_ratio ~ Treatment, data = df)
## 
## $Treatment
##                                diff          lwr         upr     p adj
## Treatment_B-Treatment_A  0.05508394 -0.090073451  0.20024134 0.8297043
## Treatment_D-Treatment_A  0.24067423  0.132480299  0.34886816 0.0000001
## Treatment_F-Treatment_A  0.09699584 -0.011198095  0.20518977 0.1010740
## Treatment_H-Treatment_A  0.20244779  0.083927276  0.32096831 0.0000650
## Treatment_D-Treatment_B  0.18559029  0.048734586  0.32244599 0.0025095
## Treatment_F-Treatment_B  0.04191189 -0.094943808  0.17876760 0.9141267
## Treatment_H-Treatment_B  0.14736385  0.002206455  0.29252124 0.0447808
## Treatment_F-Treatment_D -0.14367839 -0.240449989 -0.04690680 0.0007080
## Treatment_H-Treatment_D -0.03822644 -0.146420373  0.06996749 0.8633936
## Treatment_H-Treatment_F  0.10545195 -0.002741979  0.21364589 0.0598591
# Plot homoscedasticity (Residuals vs Fitted Values)
plot_homoscedasticity(anova_model, "Residuals vs Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'

# Plot QQ plot for residuals of the ANOVA model
plot_qq_residuals(residuals(anova_model), "QQ Plot of ANOVA Residuals")

# Plot results 
ggplot(df, aes(x = Treatment, y = `nuclear_cytoplasmic_brachyury_ratio`)) +
  stat_summary(fun = mean, geom = "bar", fill = "lightblue", color = "black", alpha = 0.7) +  # Mean as bar
  stat_summary(fun.data = function(x) {
    mean_x <- mean(x)
    sd_x <- sd(x)
    data.frame(y = mean_x, ymin = mean_x - sd_x, ymax = mean_x + sd_x)
  }, geom = "errorbar", width = 0.2, color = "black") +  # SD error bars
  theme_minimal() +
  labs(title = "Mean Nuclear/Cytoplasmic Brachyury Ratio by Treatment Group",
       x = "Treatment Group",
       y = "Nuclear/Cytoplasmic Brachyury Ratio")

Question 5
# Standard Curve Data
concentration <- c(0.46, 0.46, 0.34, 0.34, 0.17, 0.17, 0.11, 0.11, 0.027, 0.027, 0.013, 0.013, 0.007, 0.007, 0, 0) # mM
absorbance <- c(1.111, 1.112, 0.876, 0.832, 0.551, 0.505, 0.398, 0.387, 0.221, 0.293, 0.2, 0.198, 0.187, 0.19, 0.165, 0.177) # Average absorbance from wells


standard_model <- lm(log(concentration + 1) ~ absorbance)



# Get R-squared value
r_squared <- summary(standard_model)$r.squared

# Plot the standard curve
ggplot(data = data.frame(concentration, absorbance), aes(x = concentration, y = absorbance)) +
  geom_point(size = 3, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Standard Curve: Absorbance vs. ORO Concentration") +
  xlab("Concentration of ORO (mM)") +
  ylab("Absorbance") +
  annotate("text", x = 0.2, y = 2.5, label = paste("y =", round(coef(standard_model)[2], 3), "*x +", round(coef(standard_model)[1], 3))) +
  annotate("text", x = 0.2, y = 2.3, label = paste("R² =", round(r_squared, 3))) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Function to calculate concentration from absorbance using the standard curve equation
predict_concentration <- function(abs_value) {
  (abs_value - coef(standard_model)[1]) / coef(standard_model)[2]
}
# Step 1: Compute Residuals
compute_residuals <- function(y) {
  x <- seq_along(y)  # Index values (1, 2, 3, ...)
  model <- lm(y ~ x)  # Fit linear model
  predicted <- predict(model)  # Compute predicted values
  return(y - predicted)  # Compute residuals manually
}

# Function to plot Q-Q plot for each group
plot_qq_by_group <- function(df, value_col, group_col) {
  unique_groups <- unique(df[[group_col]])
  
  for (group in unique_groups) {
    group_data <- df[df[[group_col]] == group, value_col]  # Subset data for the group
    qq_plot <- ggplot(data.frame(residuals = group_data), aes(sample = residuals)) +
      stat_qq() + 
      stat_qq_line() + 
      ggtitle(paste("Q-Q Plot for", group)) +
      theme_minimal()
    
    print(qq_plot)
  }
}

# Function to perform Shapiro-Wilk test for each group
perform_shapiro_by_group <- function(df, value_col, group_col) {
  unique_groups <- unique(df[[group_col]])
  shapiro_results <- list()
  
  for (group in unique_groups) {
    group_data <- df[df[[group_col]] == group, value_col]  # Subset data for the group
    shapiro_test <- shapiro.test(group_data)  # Perform test
    shapiro_results[[group]] <- shapiro_test
    
    print(paste("Shapiro-Wilk Test for", group))
    print(shapiro_test)
  }
  
  return(shapiro_results)
}

# Step 4: Compute Variance for Each Group
compute_group_variances <- function(df, value_col, group_col) {
  variances <- aggregate(df[[value_col]], by = list(df[[group_col]]), FUN = var)
  colnames(variances) <- c("Group", "Variance")
  
  print("Variances for Each Group:")
  print(variances)
  
  return(variances)
}

# Step 5: Create Homoscedasticity Plot (Residuals vs Fitted Values)
plot_homoscedasticity <- function(anova_model, title) {
  residuals_anova <- residuals(anova_model)
  fitted_values <- fitted(anova_model)
  
  homoscedasticity_plot <- ggplot(data.frame(Fitted = fitted_values, Residuals = residuals_anova),
                                  aes(x = Fitted, y = Residuals)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(title) +
    theme_minimal()
  
  print(homoscedasticity_plot)
}

# Step 6: Perform Levene's test for homogeneity of variances
perform_levene_test <- function(df, value_col, group_col) {
  long_df <- data.frame(values = df[[value_col]], group = df[[group_col]])  # Convert to long format
  levene_test <- leveneTest(values ~ group, data = long_df)  # Levene's test
  
  print("Levene's Test for Equal Variances:")
  print(levene_test)
  
  return(levene_test)
}

# Load Sample Data
sample_data <- data.frame(
  Group = rep(c("Myo", "Osteo", "Carti", "Teno", "Uni", "Blank"), each = 8),
  Replicate = rep(rep(1:4, each = 2), times = 6),
  Well = c(
    "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8",  
    "A9", "A10", "A11", "A12", "B1", "B2", "B3", "B4",  
    "B5", "B6", "B7", "B8", "B9", "B10", "B11", "B12",  
    "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8",  
    "C9", "C10", "C11", "C12", "D1", "D2", "D3", "D4",  
    "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12"   
  ),
  Absorbance = c(
    0.187, 0.175, 0.174, 0.187, 0.178, 0.172, 0.181, 0.179,  
    0.22, 0.226, 0.214, 0.23, 0.217, 0.225, 0.234, 0.227,  
    0.194, 0.207, 0.206, 0.203, 0.2, 0.192, 0.208, 0.192,  
    0.209, 0.208, 0.195, 0.189, 0.191, 0.2, 0.195, 0.194,  
    0.11, 0.291, 0.143, 0.224, 0.185, 0.422, 0.289, 0.404,  
    0.172, 0.17, 0.165, 0.164, 0.171, 0.163, 0.166, 0.168   
  )
)

sample_data$Group <- as.factor(sample_data$Group)


str(sample_data)  # Ensure Group is a factor and Absorbance is numeric
## 'data.frame':    48 obs. of  4 variables:
##  $ Group     : Factor w/ 6 levels "Blank","Carti",..: 3 3 3 3 3 3 3 3 4 4 ...
##  $ Replicate : int  1 1 2 2 3 3 4 4 1 1 ...
##  $ Well      : chr  "A1" "A2" "A3" "A4" ...
##  $ Absorbance: num  0.187 0.175 0.174 0.187 0.178 0.172 0.181 0.179 0.22 0.226 ...
summary(sample_data)  # Check for missing values or outliers
##    Group     Replicate        Well             Absorbance    
##  Blank:8   Min.   :1.00   Length:48          Min.   :0.1100  
##  Carti:8   1st Qu.:1.75   Class :character   1st Qu.:0.1747  
##  Myo  :8   Median :2.50   Mode  :character   Median :0.1940  
##  Osteo:8   Mean   :2.50                      Mean   :0.2045  
##  Teno :8   3rd Qu.:3.25                      3rd Qu.:0.2147  
##  Uni  :8   Max.   :4.00                      Max.   :0.4220
# Convert absorbance to concentration
sample_data$Concentration <- sapply(sample_data$Absorbance, predict_concentration)


# Perform Analysis
residuals_data <- compute_residuals(sample_data$Absorbance)

# Run Q-Q plots for each treatment group
plot_qq_by_group(sample_data, "Absorbance", "Group")

# Run Shapiro-Wilk tests for each treatment group
shapiro_results <- perform_shapiro_by_group(sample_data, "Absorbance", "Group")
## [1] "Shapiro-Wilk Test for Myo"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.91687, p-value = 0.405
## 
## [1] "Shapiro-Wilk Test for Osteo"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.9721, p-value = 0.9139
## 
## [1] "Shapiro-Wilk Test for Carti"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.86708, p-value = 0.1411
## 
## [1] "Shapiro-Wilk Test for Teno"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.88514, p-value = 0.2107
## 
## [1] "Shapiro-Wilk Test for Uni"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.93804, p-value = 0.5919
## 
## [1] "Shapiro-Wilk Test for Blank"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.93635, p-value = 0.5756
group_variances <- compute_group_variances(sample_data, "Absorbance", "Group")
## [1] "Variances for Each Group:"
##   Group     Variance
## 1 Blank 1.141071e-05
## 2 Carti 4.592857e-05
## 3   Myo 3.183929e-05
## 4 Osteo 4.498214e-05
## 5  Teno 5.541071e-05
## 6   Uni 1.312486e-02
# Fit ANOVA model to check homoscedasticity
anova_model <- aov(Absorbance ~ Group, data = sample_data)
plot_homoscedasticity(anova_model, "Homoscedasticity Check: Residuals vs Fitted")
## `geom_smooth()` using formula = 'y ~ x'

# Perform Levene's Test for homogeneity of variances
perform_levene_test(sample_data, "Absorbance", "Group")
## [1] "Levene's Test for Equal Variances:"
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value    Pr(>F)    
## group  5  18.833 8.773e-10 ***
##       42                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value    Pr(>F)    
## group  5  18.833 8.773e-10 ***
##       42                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Ensure Group is a factor
sample_data$Group <- as.factor(sample_data$Group)

# Perform Welch’s ANOVA
welch_anova <- oneway.test(Absorbance ~ Group, data = sample_data, var.equal = FALSE)

# Print Welch’s ANOVA results
print("Welch's ANOVA Results:")
## [1] "Welch's ANOVA Results:"
print(welch_anova)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  Absorbance and Group
## F = 98.5, num df = 5.000, denom df = 18.977, p-value = 6.699e-13
# Ensure Group is a factor
sample_data$Group <- as.factor(sample_data$Group)

# Perform Games-Howell post-hoc test
games_howell_results <- sample_data %>%
  games_howell_test(Absorbance ~ Group)

# Print full Games-Howell test results
print("Full Games-Howell Test Results:")
## [1] "Full Games-Howell Test Results:"
print(games_howell_results)
## # A tibble: 15 × 8
##    .y.        group1 group2 estimate conf.low conf.high       p.adj p.adj.signif
##  * <chr>      <chr>  <chr>     <dbl>    <dbl>     <dbl>       <dbl> <chr>       
##  1 Absorbance Blank  Carti   0.0329   0.0236    0.0421      2   e-6 ****        
##  2 Absorbance Blank  Myo     0.0117   0.00388   0.0196      3   e-3 **          
##  3 Absorbance Blank  Osteo   0.0568   0.0476    0.0659      7.25e-9 ****        
##  4 Absorbance Blank  Teno    0.0302   0.0202    0.0403      1.36e-5 ****        
##  5 Absorbance Blank  Uni     0.0911  -0.0624    0.245       3.23e-1 ns          
##  6 Absorbance Carti  Myo    -0.0211  -0.0314   -0.0109      1.24e-4 ***         
##  7 Absorbance Carti  Osteo   0.0239   0.0128    0.0349      6.58e-5 ****        
##  8 Absorbance Carti  Teno   -0.00263 -0.0143    0.00906     9.74e-1 ns          
##  9 Absorbance Carti  Uni     0.0582  -0.0952    0.212       7.09e-1 ns          
## 10 Absorbance Myo    Osteo   0.0450   0.0348    0.0552      1.42e-8 ****        
## 11 Absorbance Myo    Teno    0.0185   0.00755   0.0294      9.46e-4 ***         
## 12 Absorbance Myo    Uni     0.0794  -0.0741    0.233       4.43e-1 ns          
## 13 Absorbance Osteo  Teno   -0.0265  -0.0381   -0.0149      3.81e-5 ****        
## 14 Absorbance Osteo  Uni     0.0344  -0.119     0.188       9.48e-1 ns          
## 15 Absorbance Teno   Uni     0.0609  -0.0926    0.214       6.75e-1 ns
# Extract significant comparisons (p < 0.05)
significant_comparisons <- games_howell_results %>%
  filter(p.adj < 0.05) %>%
  select(group1, group2, p.adj, p.adj.signif)

# ✅ Convert comparisons into correct format for ggplot (list of vectors)
sig_comparisons_list <- split(significant_comparisons[, c("group1", "group2")], seq(nrow(significant_comparisons)))
sig_comparisons_list <- lapply(sig_comparisons_list, as.character)  # Ensure correct type

# ✅ Dynamically adjust y-position for significance bars
# Adjust space for significance bars dynamically
max_y <- max(sample_data$Absorbance) -0.1  # Start position for bars
spacing_factor <- 0.05  # 🔹 Increase or decrease this for more or less spacing

y_positions <- seq(max_y, max_y + (length(sig_comparisons_list) - 1) * spacing_factor, by = spacing_factor)


# ✅ Prevent error if no significant comparisons exist
if (length(sig_comparisons_list) > 0) {
  
  # Create the boxplot with significance bars
  ggplot(sample_data, aes(x = Group, y = Absorbance, fill = Group)) +
    geom_boxplot(alpha = 0.7, outlier.shape = NA) +  # Remove outliers for clarity
    geom_jitter(width = 0.2, alpha = 0.6, size = 2, color = "black") +  # Improve visibility of points
    ggtitle("Quantified Absorbance of Stain Across Groups") +
    xlab("Group") +
    ylab("Absorbance") +
    theme_minimal() +

    # Welch’s ANOVA p-value annotation
    stat_compare_means(
      method = "anova",
      label.y = max_y + 0.5, 
      label.x.npc = "center"
    ) +

    # Games-Howell Post-Hoc Test Significance Bars
    geom_signif(
      comparisons = sig_comparisons_list,
      map_signif_level = TRUE,  # Uses significance levels (*, **, ***)
      y_position = y_positions,
      tip_length = 0.03,  # Adjusts length of bar tips
      textsize = 5  # Adjusts significance label size
    ) +

    # Formatting improvements
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "right"
    )

} else {
  print("No significant pairwise comparisons found. No significance bars added to plot.")
}
## Warning in wilcox.test.default(c(0.172, 0.17, 0.165, 0.164, 0.171, 0.163, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.172, 0.17, 0.165, 0.164, 0.171, 0.163, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.172, 0.17, 0.165, 0.164, 0.171, 0.163, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.194, 0.207, 0.206, 0.203, 0.2, 0.192, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.194, 0.207, 0.206, 0.203, 0.2, 0.192, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.187, 0.175, 0.174, 0.187, 0.178, 0.172, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.187, 0.175, 0.174, 0.187, 0.178, 0.172, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.22, 0.226, 0.214, 0.23, 0.217, 0.225, :
## cannot compute exact p-value with ties