options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))

install.packages("phyloseq")
## Warning: package 'phyloseq' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
install.packages("vegan")
## package 'vegan' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\86183\AppData\Local\Temp\RtmpK4U4yF\downloaded_packages
install.packages("ggplot2")
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\86183\AppData\Local\Temp\RtmpK4U4yF\downloaded_packages
library(phyloseq)
library(vegan)
## Warning: package 'vegan' was built under R version 4.4.3
## Loading required package: permute
## Loading required package: lattice
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# 输入OTU表格(不等重复)
otu_data <- read.csv("C:/data1/group_modified.csv", header = TRUE, row.names = 1)

# 提取OTU丰度数据
otu_matrix <- otu_data[, 1:13]  # 假设前十三列是样本丰度
colnames(otu_matrix) <- c("BFO1", "BFO2", "BFO3", "BFO4", "BFOH2", "BFOH3", "RFO1", "RFO2", "RFO3", "RFO4", "RFOH1", "RFOH3","RFOH4")  # 样本名称

sample_data <- data.frame(
  SampleID = c("BFO1","BFO2","BFO3","BFO4",
               "BFOH2","BFOH3",
               "RFO1","RFO2","RFO3","RFO4",
               "RFOH1","RFOH3","RFOH4"),
  Group = rep(c("BFO","BFOH","RFO","RFOH"), 
              times = c(4,2,4,3)),
  stringsAsFactors = FALSE
)

# 设置行名
rownames(sample_data) <- sample_data$SampleID

# 查看创建的数据
head(sample_data)
##       SampleID Group
## BFO1      BFO1   BFO
## BFO2      BFO2   BFO
## BFO3      BFO3   BFO
## BFO4      BFO4   BFO
## BFOH2    BFOH2  BFOH
## BFOH3    BFOH3  BFOH
#创建样本(分组)信息
SAM <- sample_data(sample_data)


# 提取分类信息
tax_data <- otu_data[, 14] # 第十四列是分类信息
tax_strings <- as.character(tax_data)  # 确保转换为字符类型
tax_matrix <- matrix(NA, nrow = length(tax_strings), ncol = 7)
colnames(tax_matrix) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

for (i in 1:length(tax_strings)) {
  # 按逗号拆分,并去除空白字符
  split_tax <- trimws(unlist(strsplit(tax_strings[i], ",")))
  
  # 提取每个层级的名称(去掉前缀如 "k:")
  for (j in 1:length(split_tax)) {
    level <- unlist(strsplit(split_tax[j], ":"))[1]  # 如 "k", "p", "c"...
    value <- unlist(strsplit(split_tax[j], ":"))[2]  # 如 "Bacteria", "Proteobacteria"...
    
    # 根据前缀匹配到对应的列
    if (level == "k") tax_matrix[i, "Kingdom"] <- value
    if (level == "p") tax_matrix[i, "Phylum"] <- value
    if (level == "c") tax_matrix[i, "Class"] <- value
    if (level == "o") tax_matrix[i, "Order"] <- value
    if (level == "f") tax_matrix[i, "Family"] <- value
    if (level == "g") tax_matrix[i, "Genus"] <- value
    if (level == "s") tax_matrix[i, "Species"] <- value
  }
}

# 设置行名为OTU名称
rownames(tax_matrix) <- rownames(otu_data)

# 检查结果
head(tax_matrix[, "Phylum"])
##                 OTU_10000                 OTU_10001                 OTU_10002 
##          "Proteobacteria" "unclassified_k_Bacteria"            "Bacteroidota" 
##                 OTU_10003                 OTU_10004                  OTU_1000 
## "unclassified_k_Bacteria" "unclassified_k_Bacteria" "unclassified_k_Bacteria"
# 处理部分未分类结果
tax_matrix[tax_matrix == "unclassified_k_Bacteria"] <- "unclassified"
#tax_matrix[tax_matrix == "unclassified_p_Proteobacteria"] <- "unclassified"

# 转换为phyloseq对象
TAX <- tax_table(tax_matrix)
OTU <- otu_table(otu_matrix, taxa_are_rows = TRUE)
physeq <- phyloseq(OTU, TAX, SAM)

#########################
# alpha diversity
library(tidyr)

# 计算α多样性指数
alpha_div <- estimate_richness(physeq, measures = c("Chao1", "Shannon", "Simpson"))

# 添加分组信息
meta_data <- data.frame(sample_data(physeq))
alpha_div$SampleID <- rownames(alpha_div)
alpha_div <- merge(alpha_div, meta_data, by.x = "SampleID", by.y = "row.names")
head(alpha_div)
##   SampleID    Chao1 se.chao1  Shannon   Simpson SampleID.y Group
## 1     BFO1 2741.227 32.30895 5.169566 0.9744869       BFO1   BFO
## 2     BFO2 3265.002 33.95917 5.243843 0.9717939       BFO2   BFO
## 3     BFO3 2928.869 35.74525 5.123405 0.9674471       BFO3   BFO
## 4     BFO4 3756.750 28.35577 5.632139 0.9830748       BFO4   BFO
## 5    BFOH2 3159.353 34.43258 5.351750 0.9775836      BFOH2  BFOH
## 6    BFOH3 3305.828 30.34769 5.354603 0.9746705      BFOH3  BFOH
alpha_long <- alpha_div %>%
  select(SampleID, Group, Chao1, Shannon, Simpson) %>%
  pivot_longer(cols = c(Chao1, Shannon, Simpson), names_to = "Index", values_to = "Value")

# 将指数名称转为因子并控制绘图顺序
alpha_long$Index <- factor(alpha_long$Index, levels = c("Chao1", "Shannon", "Simpson"))

# 自定配色方案
my_colors <- c("#D0B4BE", "#B6D6E4", "#8DB8A7", "#EDDE92") # 按分组数量调整

# 绘制分面箱线图
p_combined <- ggplot(alpha_long, aes(x = Group, y = Value, fill = Group)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, size = 1.5, alpha = 0.5) +
  facet_wrap(~Index, scales = "free_y", nrow = 1) + # 横向排列
  labs(x = "", y = "Alpha Diversity Value") +
  theme_bw() +
  scale_fill_manual(values = my_colors) +
  scale_color_manual(values = my_colors) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )

print(p_combined)

#######################
# beta diversity

if (!require("ape")) install.packages("ape") # 用于PCoA分析
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following object is masked from 'package:dplyr':
## 
##     where
library(ape)

# 计算距离矩阵
dist_matrix <- phyloseq::distance(physeq, method = "bray")  # 或 "unifrac"(如果有树)

# 执行PCoA
pcoa_result <- ape::pcoa(dist_matrix)

# 提取主坐标并合并样本信息
pcoa_scores <- as.data.frame(pcoa_result$vectors[, 1:2])  # 取前两轴
colnames(pcoa_scores) <- c("PCoA1", "PCoA2")
pcoa_scores$SampleID <- rownames(pcoa_scores)
pcoa_data <- merge(pcoa_scores, 
                   data.frame(sample_data(physeq)), 
                   by.x = "SampleID", 
                   by.y = "row.names")

# 自定义颜色和形状(确保组间可区分)
color <- c("#D0B4BE", "#B6D6E4", "#8DB8A7", "#EDDE92")  # 按分组数量调整
shapes <- c(16, 17, 15, 18)  # 不同形状区分组别

# 绘制PCoA图形
ggplot(pcoa_data, aes(x = PCoA1, y = PCoA2, 
                      color = Group, shape = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  stat_ellipse(data = subset(pcoa_data, Group %in% c("BFO", "RFO")),  # 只对A、B组画椭圆
               level = 0.8, linetype = 2, linewidth = 0.5) +
  scale_color_manual(values = color) +
  scale_shape_manual(values = shapes) +
  labs(x = paste0("PCoA1 (", round(pcoa_result$values$Relative_eig[1] * 100, 1), "%)"),
       y = paste0("PCoA2 (", round(pcoa_result$values$Relative_eig[2] * 100, 1), "%)"),
       title = "Beta Diversity PCoA (Bray-Curtis Distance)") +
  theme_bw() +
  theme(legend.position = "right") +
  annotate("text", x = Inf, y = Inf, 
           label = paste("BFO: n=4\nBFOH: n=2\nRFO: n=4\nRFOH: n=3"), 
           hjust = 1.1, vjust = 1.1)

###############
# defining URL
url <- "https://example.com/somefile.zip"

# defining address
destfile <- "D:output.word"


# use "curl" to download
download.file(url, destfile, method = "curl")