# Packages used by this analysis script ----
# dplyr is attached last so its verbs (filter(), select(), ...) are not masked.
library(car)        # leveneTest()
library(survival)   # Surv(), survfit()
library(survminer)  # ggsurvplot()
library(janitor)    # clean_names()
library(ggplot2)    # ggplot() and geoms
library(dplyr)      # %>%, mutate(), select(), filter(), where()

# Load Dataset ----
# The draft read the data three times: two interactive file.choose() prompts
# whose results were immediately overwritten, then the project-relative path.
# Read it once from that path and only prompt when the file is not there.
data_path <- "PhageR/PHAGE_BIOSTATS_R.csv"
if (!file.exists(data_path)) {
  data_path <- file.choose()
}
# If the CSV holds non-UTF-8 text, add `fileEncoding = "latin1"` here.
phage_data <- read.csv(data_path)

# Examine Structure and Head ----
str(phage_data)
head(phage_data)

# Clean Column Names ----
# Standardise the column names before they are referenced below.
phage_data <- phage_data %>%
  clean_names()

# Convert to Factors ----
# Categorical columns must be factors for the grouped plots and tests below.
phage_data <- phage_data %>%
  mutate(
    phage_type = as.factor(phage_type),
    host_species = as.factor(host_species),
    lysis_success = as.factor(lysis_success)
  )

# Part B: Descriptive Statistics ----

# Summary Statistics
summary(phage_data$genome_length_kb)
summary(phage_data$burst_size)

# Standard deviations of the two main numeric variables.
# sd() returns NA if the column contains any missing value.
sd(phage_data$genome_length_kb)
sd(phage_data$burst_size)

# Mode Function ----

# Return the most frequent value of `x`.
#
# x: an atomic vector (numeric, character, factor, ...).
#
# Returns a length-one vector holding the value that occurs most often.
# Ties go to the value that appears first in `x`, because unique() keeps
# first-appearance order and which.max() returns the first maximum.
# NA is counted like any other value, and a zero-length `x` gives a
# zero-length result.
mode_func <- function(x) {
  ux <- unique(x)
  # match() maps every element to its index in `ux`; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}

# Most frequent value of each main numeric variable.
mode_func(phage_data$genome_length_kb)
mode_func(phage_data$burst_size)

# Missing Values ----
# Number of NA entries per column.
colSums(is.na(phage_data))

# Numeric Subset ----
numeric_data <- phage_data %>%
  select(where(is.numeric))

# Filter Data ----
# Keep lytic phages whose burst size exceeds 100.
lytic_subset <- phage_data %>%
  filter(phage_type == "Lytic", burst_size > 100)

# Create New Variable ----
# Burst size divided by latent period (in minutes).
phage_data <- phage_data %>%
  mutate(burst_per_minute = burst_size / latent_period_min)

# Part C: Data Visualization ----

# Histograms
ggplot(phage_data, aes(genome_length_kb)) +
  geom_histogram() +
  ggtitle("Genome Length Distribution")

# Histogram of burst size.
ggplot(phage_data, aes(burst_size)) +
  geom_histogram() +
  ggtitle("Burst Size Distribution")

# Boxplot ----
ggplot(phage_data, aes(phage_type, plaque_diameter_mm)) +
  geom_boxplot() +
  ggtitle("Plaque Diameter by Phage Type")

# Scatterplot with Regression ----
# Mapping colour to phage_type makes geom_smooth() fit one line per type.
ggplot(phage_data, aes(genome_length_kb, burst_size, color = phage_type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Genome Length vs Burst Size")

# Barplots ----
ggplot(phage_data, aes(phage_type)) +
  geom_bar() +
  ggtitle("Phage Type Distribution")

# Barplot of host species counts.
ggplot(phage_data, aes(host_species)) +
  geom_bar() +
  ggtitle("Host Species Distribution")

# Part D: Statistical Testing ----

# Normality Test
# Shapiro-Wilk test on plaque diameter.
shapiro.test(phage_data$plaque_diameter_mm)

# Normal Q-Q plot of plaque diameter, with a reference line.
qqnorm(phage_data$plaque_diameter_mm)
qqline(phage_data$plaque_diameter_mm)

# Levene's Test ----
# Homogeneity of variance of plaque diameter across phage types.
leveneTest(plaque_diameter_mm ~ phage_type, data = phage_data)

# T-tests ----
# One-sample t-test of mean plaque diameter against mu = 3.
t.test(phage_data$plaque_diameter_mm, mu = 3)

# Two-sample t-test of plaque diameter between phage types.
# NOTE(review): the formula interface needs phage_type to have exactly two
# levels - confirm against the data.
t.test(plaque_diameter_mm ~ phage_type, data = phage_data)

# Non-parametric counterpart of the two-sample t-test above.
wilcox.test(plaque_diameter_mm ~ phage_type, data = phage_data)

# ANOVA & Kruskal-Wallis ----
# One-way ANOVA of burst size across host species.
anova_result <- aov(burst_size ~ host_species, data = phage_data)
summary(anova_result)

# Non-parametric counterpart of the one-way ANOVA on burst size.
kruskal.test(burst_size ~ host_species, data = phage_data)

# Part E: Chi-square, Correlation and Regression ----

# Contingency Table
# Cross-tabulate phage type (rows) against lysis success (columns).
table_data <- table(phage_data$phage_type, phage_data$lysis_success)
table_data

# Chi-square & Fisher Test ----
chisq.test(table_data)

# Fisher's exact test on the same contingency table.
fisher.test(table_data)

# Correlation ----
# cor() returns NA if either column contains a missing value.
cor(
  phage_data$genome_length_kb,
  phage_data$gc_content_pct,
  method = "pearson"
)

cor(
  phage_data$burst_size,
  phage_data$latent_period_min,
  method = "spearman"
)

# Regression Models ----
# Simple linear regression of burst size on genome length.
lm_simple <- lm(burst_size ~ genome_length_kb, data = phage_data)
summary(lm_simple)

# Multiple linear regression of burst size on three numeric predictors.
lm_multiple <- lm(
  burst_size ~ genome_length_kb + latent_period_min + gc_content_pct,
  data = phage_data
)
summary(lm_multiple)

# Logistic Regression ----
# NOTE(review): assumes lysis_success has exactly two levels - confirm.
log_model <- glm(
  lysis_success ~ phage_type + plaque_diameter_mm + eop,
  data = phage_data,
  family = binomial
)
summary(log_model)

# Part F: Survival Analysis ----
# NOTE(review): Surv() is given no event indicator, so every observation is
# treated as an observed event (no censoring) - confirm this is intended.
surv_obj <- Surv(phage_data$phage_survival_days)

# Survival curves, one per phage type.
fit <- survfit(surv_obj ~ phage_type, data = phage_data)

# Plot the curves; pval = TRUE adds a p-value comparing the groups.
ggsurvplot(
  fit,
  data = phage_data,
  pval = TRUE,
  title = "Survival Curve by Phage Type"
)