# Load necessary libraries
library(ggplot2)  # For QQ plot
library(car)      # For Levene's test
## Loading required package: carData
library(stats)    # For Shapiro-Wilk test
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(rstatix) # For normality tests
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(dunn.test)
library(ggpubr)
Question 4
# Read the CSV file
df <- read.csv('/Users/rafael/Desktop/UCSB_Grad/BIOE 212/Homework/HW5_Sample_Sheet _Group_two.csv')  # Replace with actual file path
# Ensure treatment column is a factor
df$Treatment <- as.factor(df$Treatment)

# Function to compute residuals for a single dataset (one-sample model)
compute_residuals <- function(y) {
  x <- seq_along(y)  # Index values (1, 2, 3, ...)
  model <- lm(y ~ x)  # Fit linear model
  predicted <- predict(model)  # Compute predicted values
  return(y - predicted)  # Compute residuals manually
}

# Function to compute variance for a single dataset
compute_single_group_variance <- function(y) {
  return(var(y))  # Compute variance of the dataset
}

# Function to perform Levene's test for homogeneity of variances
perform_levene_test <- function(df, value_col, group_col) {
  long_df <- data.frame(values = df[[value_col]], group = df[[group_col]])  # Convert to long format
  levene_test <- leveneTest(values ~ group, data = long_df)  # Levene's test
  
  print("Levene's Test for Equal Variances:")
  print(levene_test)
  
  return(levene_test)
}

# Function to plot QQ plot for residuals
plot_qq_residuals <- function(residuals, title) {
  qq_plot <- ggplot(data.frame(residuals = residuals), aes(sample = residuals)) +
    stat_qq() + 
    stat_qq_line() + 
    ggtitle(title) +
    theme_minimal()
  
  print(qq_plot)
}

# Function to plot homoscedasticity check (Residuals vs Fitted Values)
plot_homoscedasticity <- function(anova_model, title) {
  residuals_anova <- residuals(anova_model)
  fitted_values <- fitted(anova_model)
  
  homoscedasticity_plot <- ggplot(data.frame(Fitted = fitted_values, Residuals = residuals_anova),
                                  aes(x = Fitted, y = Residuals)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(title) +
    theme_minimal()
  
  print(homoscedasticity_plot)
}
# Compute residuals for each treatment group
residuals_list <- lapply(split(df$`nuclear_cytoplasmic_brachyury_ratio`, df$Treatment), compute_residuals)

# Perform Shapiro-Wilk test for each treatment's residuals
shapiro_results <- lapply(residuals_list, shapiro.test)
print(shapiro_results)
## $Treatment_A
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.96523, p-value = 0.6526
## 
## 
## $Treatment_B
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.97855, p-value = 0.9569
## 
## 
## $Treatment_D
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.97749, p-value = 0.7557
## 
## 
## $Treatment_F
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.92199, p-value = 0.03022
## 
## 
## $Treatment_H
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.95669, p-value = 0.48
# Compute variance for each treatment group
variances <- sapply(split(df$`nuclear_cytoplasmic_brachyury_ratio`, df$Treatment), compute_single_group_variance)
print(variances)
## Treatment_A Treatment_B Treatment_D Treatment_F Treatment_H 
## 0.005368700 0.009660559 0.027971679 0.027720539 0.005805254
# Perform Levene's test for homogeneity of variances

levene_test <- perform_levene_test(df, "nuclear_cytoplasmic_brachyury_ratio", "Treatment")
## [1] "Levene's Test for Equal Variances:"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   4  3.0986 0.01869 *
##       105                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform one-way ANOVA
anova_model <- aov(`nuclear_cytoplasmic_brachyury_ratio` ~ Treatment, data = df)
anova_result <- summary(anova_model)
print(anova_result)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Treatment     4 0.8976 0.22439   12.31 3.04e-08 ***
## Residuals   105 1.9143 0.01823                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If ANOVA is significant, perform Tukey's post-hoc test
if (anova_result[[1]]$"Pr(>F)"[1] < 0.05) {
  posthoc <- TukeyHSD(anova_model)
  # Extract Tukey's test results into a data frame
  tukey_results <- as.data.frame(posthoc$Treatment)
  tukey_results$Comparison <- rownames(tukey_results)
  
  # Select significant comparisons (p-value < 0.05)
  tukey_significant <- tukey_results[tukey_results$`p adj` < 0.05, ]
  
  print("Significant Comparisons from Tukey's Test:")
  print(tukey_significant)
}
## [1] "Significant Comparisons from Tukey's Test:"
##                               diff          lwr        upr        p adj
## Treatment_D-Treatment_A  0.2406742  0.132480299  0.3488682 1.263292e-07
## Treatment_H-Treatment_A  0.2024478  0.083927276  0.3209683 6.496919e-05
## Treatment_D-Treatment_B  0.1855903  0.048734586  0.3224460 2.509519e-03
## Treatment_H-Treatment_B  0.1473638  0.002206455  0.2925212 4.478078e-02
## Treatment_F-Treatment_D -0.1436784 -0.240449989 -0.0469068 7.080263e-04
##                                      Comparison
## Treatment_D-Treatment_A Treatment_D-Treatment_A
## Treatment_H-Treatment_A Treatment_H-Treatment_A
## Treatment_D-Treatment_B Treatment_D-Treatment_B
## Treatment_H-Treatment_B Treatment_H-Treatment_B
## Treatment_F-Treatment_D Treatment_F-Treatment_D
# Extract significant comparisons into a list format
significant_comparisons <- lapply(strsplit(tukey_significant$Comparison, "-"), function(x) c(x[1], x[2]))


# Plot homoscedasticity (Residuals vs Fitted Values)
plot_homoscedasticity(anova_model, "Residuals vs Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'

# Plot QQ plot for residuals of the ANOVA model
plot_qq_residuals(residuals(anova_model), "QQ Plot of ANOVA Residuals")

# Plot with significance bars using ggpubr
ggplot(df, aes(x = Treatment, y = `nuclear_cytoplasmic_brachyury_ratio`)) +
  stat_summary(fun = mean, geom = "bar", fill = "lightblue", color = "black", alpha = 0.7) +  # Mean as bar
  stat_summary(fun.data = function(x) {
    mean_x <- mean(x)
    sd_x <- sd(x)
    data.frame(y = mean_x, ymin = mean_x - sd_x, ymax = mean_x + sd_x)
  }, geom = "errorbar", width = 0.2, color = "black") +  # SD error bars
  theme_minimal() +
  labs(title = "Mean Nuclear/Cytoplasmic Brachyury Ratio by Treatment Group",
       x = "Treatment Group",
       y = "Nuclear/Cytoplasmic Brachyury Ratio") +
  stat_compare_means(method = "anova", label.y = max(df$nuclear_cytoplasmic_brachyury_ratio) + 0.5) +  # ANOVA p-value at the top
  stat_compare_means(comparisons = significant_comparisons, 
                     method = "t.test", 
                     label = "p.signif", 
                     p.adjust.method = "BH")  # Apply Benjamini-Hochberg correction

Question 5
# Standard Curve Data
concentration <- c(0.46, 0.34, 0.17, 0.11, 0.027, 0.013, 0.007, 0) # mM
absorbance <- c(1.111, 1.112, 0.876, 0.832, 0.551, 0.505, 0.398, 0.387) # Average absorbance from wells


standard_model <- lm(log(concentration + 1) ~ absorbance)



# Get R-squared value
r_squared <- summary(standard_model)$r.squared

# Plot the standard curve
ggplot(data = data.frame(concentration, absorbance), aes(x = concentration, y = absorbance)) +
  geom_point(size = 3, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Standard Curve: Absorbance vs. ORO Concentration") +
  xlab("Concentration of ORO (mM)") +
  ylab("Absorbance") +
  annotate("text", x = 0.2, y = 2.5, label = paste("y =", round(coef(standard_model)[2], 3), "*x +", round(coef(standard_model)[1], 3))) +
  annotate("text", x = 0.2, y = 2.3, label = paste("R² =", round(r_squared, 3))) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Function to calculate concentration from absorbance using the standard curve equation
predict_concentration <- function(abs_value) {
  (abs_value - coef(standard_model)[1]) / coef(standard_model)[2]
}
# Step 1: Compute Residuals
compute_residuals <- function(y) {
  x <- seq_along(y)  # Index values (1, 2, 3, ...)
  model <- lm(y ~ x)  # Fit linear model
  predicted <- predict(model)  # Compute predicted values
  return(y - predicted)  # Compute residuals manually
}

# Function to plot Q-Q plot for each group
plot_qq_by_group <- function(df, value_col, group_col) {
  unique_groups <- unique(df[[group_col]])
  
  for (group in unique_groups) {
    group_data <- df[df[[group_col]] == group, value_col]  # Subset data for the group
    qq_plot <- ggplot(data.frame(residuals = group_data), aes(sample = residuals)) +
      stat_qq() + 
      stat_qq_line() + 
      ggtitle(paste("Q-Q Plot for", group)) +
      theme_minimal()
    
    print(qq_plot)
  }
}

# Function to perform Shapiro-Wilk test for each group
perform_shapiro_by_group <- function(df, value_col, group_col) {
  unique_groups <- unique(df[[group_col]])
  shapiro_results <- list()
  
  for (group in unique_groups) {
    group_data <- df[df[[group_col]] == group, value_col]  # Subset data for the group
    shapiro_test <- shapiro.test(group_data)  # Perform test
    shapiro_results[[group]] <- shapiro_test
    
    print(paste("Shapiro-Wilk Test for", group))
    print(shapiro_test)
  }
  
  return(shapiro_results)
}

# Step 4: Compute Variance for Each Group
compute_group_variances <- function(df, value_col, group_col) {
  variances <- aggregate(df[[value_col]], by = list(df[[group_col]]), FUN = var)
  colnames(variances) <- c("Group", "Variance")
  
  print("Variances for Each Group:")
  print(variances)
  
  return(variances)
}

# Step 5: Create Homoscedasticity Plot (Residuals vs Fitted Values)
plot_homoscedasticity <- function(anova_model, title) {
  residuals_anova <- residuals(anova_model)
  fitted_values <- fitted(anova_model)
  
  homoscedasticity_plot <- ggplot(data.frame(Fitted = fitted_values, Residuals = residuals_anova),
                                  aes(x = Fitted, y = Residuals)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    ggtitle(title) +
    theme_minimal()
  
  print(homoscedasticity_plot)
}

# Step 6: Perform Levene's test for homogeneity of variances
perform_levene_test <- function(df, value_col, group_col) {
  long_df <- data.frame(values = df[[value_col]], group = df[[group_col]])  # Convert to long format
  levene_test <- leveneTest(values ~ group, data = long_df)  # Levene's test
  
  print("Levene's Test for Equal Variances:")
  print(levene_test)
  
  return(levene_test)
}

# Load Sample Data
sample_data <- data.frame(
  Group = rep(c("Myo", "Osteo", "Carti", "Teno", "Uni", "Blank"), each = 8),
  Replicate = rep(rep(1:4, each = 2), times = 6),
  Well = c(
    "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8",  
    "A9", "A10", "A11", "A12", "B1", "B2", "B3", "B4",  
    "B5", "B6", "B7", "B8", "B9", "B10", "B11", "B12",  
    "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8",  
    "C9", "C10", "C11", "C12", "D1", "D2", "D3", "D4",  
    "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12"   
  ),
  Absorbance = c(
    0.187, 0.175, 0.174, 0.187, 0.178, 0.172, 0.181, 0.179,  
    0.217, 0.225, 0.234, 0.227, 0.194, 0.207, 0.206, 0.203,  
    0.209, 0.208, 0.195, 0.189, 0.191, 0.200, 0.195, 0.194,  
    0.185, 0.422, 0.289, 0.404, 0.172, 0.170, 0.165, 0.164,  
    0.171, 0.163, 0.166, 0.168, 0.172, 0.181, 0.179, 0.220,  
    0.226, 0.214, 0.230, 0.217, 0.225, 0.234, 0.227, 0.194   
  )
)

# Convert absorbance to concentration
sample_data$Concentration <- sapply(sample_data$Absorbance, predict_concentration)


# Perform Analysis
residuals_data <- compute_residuals(sample_data$Absorbance)

# Run Q-Q plots for each treatment group
plot_qq_by_group(sample_data, "Absorbance", "Group")

# Run Shapiro-Wilk tests for each treatment group
shapiro_results <- perform_shapiro_by_group(sample_data, "Absorbance", "Group")
## [1] "Shapiro-Wilk Test for Myo"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.91687, p-value = 0.405
## 
## [1] "Shapiro-Wilk Test for Osteo"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.95543, p-value = 0.7656
## 
## [1] "Shapiro-Wilk Test for Carti"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.88514, p-value = 0.2107
## 
## [1] "Shapiro-Wilk Test for Teno"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.74551, p-value = 0.007285
## 
## [1] "Shapiro-Wilk Test for Uni"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.73032, p-value = 0.004937
## 
## [1] "Shapiro-Wilk Test for Blank"
## 
##  Shapiro-Wilk normality test
## 
## data:  group_data
## W = 0.86129, p-value = 0.1237
group_variances <- compute_group_variances(sample_data, "Absorbance", "Group")
## [1] "Variances for Each Group:"
##   Group     Variance
## 1 Blank 1.601250e-04
## 2 Carti 5.541071e-05
## 3   Myo 3.183929e-05
## 4 Osteo 1.904107e-04
## 5  Teno 1.229227e-02
## 6   Uni 3.322857e-04
# Fit ANOVA model to check homoscedasticity
anova_model <- aov(Absorbance ~ Group, data = sample_data)
plot_homoscedasticity(anova_model, "Homoscedasticity Check: Residuals vs Fitted")
## `geom_smooth()` using formula = 'y ~ x'

# Perform Levene's Test for homogeneity of variances
perform_levene_test(sample_data, "Absorbance", "Group")
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## [1] "Levene's Test for Equal Variances:"
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)   
## group  5  3.6817 0.00748 **
##       42                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)   
## group  5  3.6817 0.00748 **
##       42                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Perform Kruskal-Wallis 
kruskal_test <- kruskal.test(Absorbance ~ Group, data = sample_data)

# Print Kruskal-Wallis results
print("Kruskal-Wallis Test:")
## [1] "Kruskal-Wallis Test:"
print(kruskal_test)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Absorbance by Group
## Kruskal-Wallis chi-squared = 22.119, df = 5, p-value = 0.0004971
# Perform Benjamini-Hochberg post-hoc test 
# Perform pairwise Dunn’s test
# Perform Dunn’s test with Benjamini-Hochberg correction
dunn_results <- dunn.test(sample_data$Concentration, sample_data$Group, method = "bh")
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 22.1187, df = 5, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                              (Benjamini-Hochberg)                              
## Col Mean-|
## Row Mean |      Blank      Carti        Myo      Osteo       Teno
## ---------+-------------------------------------------------------
##    Carti |   1.572111
##          |     0.0966
##          |
##      Myo |   3.322871   1.750760
##          |    0.0033*     0.0857
##          |
##    Osteo |   0.482352  -1.089759  -2.840519
##          |     0.3373     0.1724    0.0084*
##          |
##     Teno |   2.179518   0.607406  -1.143353   1.697165
##          |     0.0439     0.3136     0.1724     0.0841
##          |
##      Uni |   3.698034   2.125923   0.375162   3.215682   1.518516
##          |    0.0016*     0.0419     0.3538    0.0033*     0.0967
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2
# Extract pairwise comparisons & adjusted p-values
significance_table <- data.frame(
  group1 = sapply(strsplit(dunn_results$comparisons, " - "), `[`, 1),
  group2 = sapply(strsplit(dunn_results$comparisons, " - "), `[`, 2),
  p.adj = dunn_results$P.adjusted
)

# Print significance results
print(significance_table)
##    group1 group2       p.adj
## 1   Blank  Carti 0.096603940
## 2   Blank    Myo 0.003341095
## 3   Carti    Myo 0.085700566
## 4   Blank  Osteo 0.337261964
## 5   Carti  Osteo 0.172387072
## 6     Myo  Osteo 0.008445025
## 7   Blank   Teno 0.043939812
## 8   Carti   Teno 0.313604463
## 9     Myo   Teno 0.172426201
## 10  Osteo   Teno 0.084061260
## 11  Blank    Uni 0.001629564
## 12  Carti    Uni 0.041887057
## 13    Myo    Uni 0.353769646
## 14  Osteo    Uni 0.003253370
## 15   Teno    Uni 0.096663154
ggplot(sample_data, aes(x = Group, y = Concentration, fill = Group)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ggtitle("Quantified Concentration of Stain Across Groups") +
  xlab("Group") +
  ylab("Stain Concentration (mM)") +
  theme_minimal() +
  stat_compare_means(
    method = "kruskal.test", 
    label.y = max(sample_data$Concentration) + 0.4, 
    label.x = 5  # Centers the label horizontally
  ) +  
  stat_compare_means(
    comparisons = list(c("Myo", "Osteo"), c("Myo", "Carti"), c("Myo", "Teno"),
                       c("Myo", "Uni"), c("Myo", "Blank"), c("Osteo", "Carti")),
    method = "wilcox.test",  # Dunn’s test is based on Wilcoxon test
    p.adjust.method = "BH",
    label = "p.signif"
  ) +
  theme(plot.title = element_text(hjust = 0.5))  # Center title
## Warning in wilcox.test.default(c(0.86354511408227, 0.837169049328785,
## 0.834971043932661, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.86354511408227, 0.837169049328785,
## 0.834971043932661, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.86354511408227, 0.837169049328785,
## 0.834971043932661, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.86354511408227, 0.837169049328785,
## 0.834971043932661, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.86354511408227, 0.837169049328785,
## 0.834971043932661, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.929485275965982, 0.947069319134971,
## 0.966851367700085, : cannot compute exact p-value with ties