Code to be used for statewide cyano gene qPCR (Bend Geentics) + toxin data

set up

install packages

library(knitr)
library(seqinr)
library(reshape2)
library(ggplot2)
library(vegan)
library(stringr)
library(plyr)
#detach(package:dplyr)
library(readxl)
library(ggpubr)
library(segmented)
set.seed(100)

upload files

path<-"/Users/kylielanglois/OneDrive - SCCWRP/algae/FHABs/FHAB_geneTox/"
d<-read.csv(file.path(path, "2023-2025_WB_Toxins_qPCR Data_110725.csv"))

ceden<-read_excel(file.path("/Users/kylielanglois/OneDrive - SCCWRP/algae/mASCI", 
                            "CEDEN_stationLU.xlsx"), sheet = 1)
ceden<-ceden[!duplicated(ceden$StationCode), ]

edit stationcodes, where possible, from CEDEN/SWAMP StationDescription

d$Location<-gsub("554SJC556", "544SJC556", d$Location)
d$Location<-gsub("633RLCK5", "633RLCK05", d$Location)
d$Location<-gsub("637ELGB01", "637EL6B", d$Location)
d$Location<-gsub("Cache Creek Camp Haswell", "560XCCHWC", d$Location)
d$Location<-gsub("American River @ Watt Ave.", "514SAC011", d$Location)
d$Location<-gsub("General Creek", "634S18359", d$Location)
d$Location<-gsub("Gold Run", "637CE0655", d$Location)
d$Location<-gsub("Laguna Lake at Dock", "310LAGDOC", d$Location)
d$Location<-gsub("Lake Clementine", "514PLA071", d$Location)
d$Location<-gsub("Lake Evans E Shore", "801PEL155", d$Location)
d$Location<-gsub("Lake Evans NE Shore", "801PEL155", d$Location)
d$Location<-gsub("Lake Evans SE Shore", "801PEL155", d$Location)
d$Location<-gsub("Lake Gregory", "628PGL147", d$Location)
d$Location<-gsub("Lake Hemet N. Shore", "802LHN004", d$Location)
d$Location<-gsub("Lake Nacimiento Day Use Area", "309NACIDU", d$Location) 
d$Location<-gsub("Lake Nacimiento Marina Launch", "309NACIMA", d$Location)
d$Location<-gsub("Lake Nacimiento\\b", "309NACIMA", d$Location)
d$Location<-gsub("Lake Pillsbury", "111LKPL01", d$Location)
d$Location<-gsub("Lake San Antonio N. Shore", "309SANTMC", d$Location)
d$Location<-gsub("Lopez Lake Cottonwood Cove", "310PLL106", d$Location)
d$Location<-gsub("Lopez Lake Kayak Dock", "310PLL106", d$Location)
d$Location<-gsub("Lost Lake Unnamed Pond", "801LL0001", d$Location)
d$Location<-gsub("Lost Lakes", "801LL0001", d$Location)
d$Location<-gsub("Oso Flaco Creek", "312OFL" , d$Location)
d$Location<-gsub("Prosser Creek Reservoir", "635PS0294", d$Location)
d$Location<-gsub("San Joaquin River", "BG30", d$Location)
d$Location<-gsub("Ventura River", "402VNTEST", d$Location)
d$Location<-gsub("Upper Truckee River", "3-404", d$Location)
d$Location<-gsub("Tallac Creek", "SLT-TALL-00", d$Location)
d$Location<-gsub("Spring Pond", "909SPNGPD", d$Location)
d$Location<-gsub("Spring Lake", "114PSP009", d$Location)
d$Location<-gsub("Smith Creek", "204R00189", d$Location)
d$Location<-gsub("Santa Margarita lake", "309SMLMAR", d$Location)

organize into supertypes

d.loc<-merge(ceden, d, by.x="StationCode", by.y="Location", all.y = T)
#merge with CEDEN data

#supertypes from Jayme Smith
d.loc$Sample.SuperType<-
  ifelse(grepl("AlgalMat", d.loc$Sample.Type), "AlgalMat",
         ifelse(grepl("Algal Mat", d.loc$Sample.Type), "AlgalMat",
                ifelse(grepl("Scum", d.loc$Sample.Type), "Scum", 
                       ifelse(grepl("Water", d.loc$Sample.Type), "Water", ""))))

#supertypes from Jayme Smith
d.loc$System.Supertype<-
  ifelse(grepl("Lake|Pond|Marina|Beach|Reservoir|pond|Cove|Levee", 
               d.loc$StationCode), "nonflowing", 
         ifelse(grepl("River|Creek|Eel|Channel|Run|Canal", 
                      d.loc$StationCode), "flowing", NA))

#supertypes from Jayme Smith
d.loc$System.Supertype<-
  ifelse(!is.na(d.loc$System.Supertype), d.loc$System.Supertype, 
  ifelse(grepl("Falls|Trib|River|Creek|Channel|Klamath", 
               d.loc$StationName), "flowing", 
  ifelse(grepl("Lake|Cove|Reservoir|Bay|Pond|Slough|pond|Lagoon|",
               d.loc$StationName), "nonflowing", NA)))

#estimate regions
d.loc$Region<-sub("_.*", "", d.loc$Project.Name)
d.loc$Region<-gsub("RWB", "", d.loc$Region)
d.loc$Region<-gsub("RWQCB", "", d.loc$Region)
d.loc$Region<-gsub("SWAMP", "6", d.loc$Region)

library(dplyr)
#summarize supertypes
d.loc.summ <- d.loc %>%
  group_by(System.Supertype, Sample.SuperType) %>%
  summarise(samples=length(Sample.SuperType))
kable(d.loc.summ)
System.Supertype Sample.SuperType samples
flowing AlgalMat 92
flowing Scum 3
flowing Water 45
nonflowing AlgalMat 10
nonflowing Scum 11
nonflowing Water 130
NA AlgalMat 29
NA Scum 3
NA Water 25

how many samples?

d.loc1<-d.loc
d.loc1[,20:27]<-lapply(d.loc1[,20:27], gsub, pattern="([0-9]+).*$", replacement=1)
#if value, it was tested
d.loc1[,20:27]<-lapply(d.loc1[,20:27], gsub, pattern="ND", replacement=1)
#if non-detect, it was tested
d.loc1[,20:27]<-lapply(d.loc1[,20:27], gsub, pattern="^$", replacement=NA)
#if NA, it was NOT tested
d.loc1[,20:27]<-sapply(d.loc1[,20:27], as.numeric)

d.loc1$tox.present<-rowSums(d.loc1[, grepl("ug.L", colnames(d.loc1))], na.rm = T)
#count toxin tests
d.loc1$gene.present<-rowSums(d.loc1[, grepl("mL", colnames(d.loc1))], na.rm = T)
#count qPCR tests

#count paired toxin-qPCR tests for each group
d.loc1$ANA<-rowSums(d.loc1[, 
                           grepl("anaC|Anatox", colnames(d.loc1))], na.rm = T)
d.loc1$MYC<-rowSums(d.loc1[, 
                           grepl("mcy|Microcys", colnames(d.loc1))], na.rm = T)
d.loc1$CYL<-rowSums(d.loc1[, 
                           grepl("cyr|Cylind", colnames(d.loc1))], na.rm = T)
d.loc1$SXT<-rowSums(d.loc1[, 
                           grepl("sxt|Sax", colnames(d.loc1))], na.rm = T)

#reshape for plotting
d.tox1.m<-melt(d.loc1[, 20:29])
d.tox1.m$group<-ifelse(grepl("anaC|Anatox", d.tox1.m$variable), "ANA", 
                       ifelse(grepl("mcy|Microcys", d.tox1.m$variable), "MCY", 
                       ifelse(grepl("cyr|Cylind", d.tox1.m$variable), "CYL",
                       ifelse(grepl("sxt|Sax", d.tox1.m$variable), "SXT", NA))))
d.tox1.m$assay<-ifelse(grepl("ug.L", d.tox1.m$variable), "toxin", "gene")

gt2<-ggplot(data = d.tox1.m)+
  geom_bar(aes(x=group, y=value, fill=assay), 
                 stat = "identity", position = position_stack())+
  scale_fill_manual(values=c("#669900", "#ab75bd"), name="", 
                    labels=c("gene", "toxin"))+
  facet_grid(Sample.SuperType~System.Supertype, scales = "free_y")+
  labs(x="assays", y="# samples")+
  theme_bw()+
  theme(legend.position = "bottom")
gt2

paired sample prep

d.loc2<-d.loc1[!is.na(d.loc1$System.Supertype), ] #only those with systems

d.ana<-d.loc2$LIMS.ID[d.loc2$ANA==2] #paired gene-toxin ONLY
df.ana<-d.loc[d.loc$LIMS.ID %in% d.ana, ] #get all info of paired
df.ana.m<-melt(df.ana[, grepl("LIMS|Super|anaC|Anat", colnames(df.ana))], 
               id.vars = c("LIMS.ID", "Sample.SuperType", "System.Supertype")) 
#only need some columns

#repeat for all genes/toxins
d.myc<-d.loc2$LIMS.ID[d.loc2$MYC==2] #paired gene-toxin ONLY
df.myc<-d.loc[d.loc$LIMS.ID %in% d.myc, ] #get all info of paired
df.myc.m<-melt(df.myc[, grepl("LIMS|Super|mcyE|Micro", colnames(df.myc))], 
               id.vars = c("LIMS.ID", "Sample.SuperType", "System.Supertype"))

d.cyl<-d.loc2$LIMS.ID[d.loc2$CYL==2] #paired gene-toxin ONLY
df.cyl<-d.loc[d.loc$LIMS.ID %in% d.cyl, ] #get all info of paired
df.cyl.m<-melt(df.cyl[, grepl("LIMS|Super|cyr|Cyl", colnames(df.cyl))], 
               id.vars = c("LIMS.ID", "Sample.SuperType", "System.Supertype"))

d.sxt<-d.loc2$LIMS.ID[d.loc2$SXT==2] #paired gene-toxin ONLY
df.sxt<-d.loc[d.loc$LIMS.ID %in% d.sxt, ] #get all info of paired
df.sxt.m<-melt(df.sxt[, grepl("LIMS|Super|sxt|Saxi", colnames(df.sxt))], 
               id.vars = c("LIMS.ID", "Sample.SuperType", "System.Supertype"))

df.all<-rbind(df.ana.m, df.myc.m, df.cyl.m, df.sxt.m) 
#combine columns of interest

#change numbers with commas into numeric
df.all$value<-gsub(",", "", df.all$value)
df.all$value<-gsub("ND", "0", df.all$value) #CHANGE ND TO 0 (for plotting)
df.all$value<-as.numeric(df.all$value)

#for plotting
df.all$assay<-ifelse(grepl("qPCR", df.all$variable), 
                     "gene (copies per mL)", "toxin (ug per L)")
df.all$group<-ifelse(grepl("anaC|Anatox", df.all$variable), "ANA", 
                       ifelse(grepl("mcy|Microcys", df.all$variable), "MCY", 
                       ifelse(grepl("cyr|Cylind", df.all$variable), "CYL",
                       ifelse(grepl("sxt|Sax", df.all$variable), "SXT", NA))))
df.all$supertype<-paste(df.all$System.Supertype, df.all$Sample.SuperType)
df.all$supertype<-gsub("nonflowing", "NF", df.all$supertype)
df.all$supertype<-gsub("flowing", "F", df.all$supertype)

#get region
df.all$uniqueID<-paste(df.all$LIMS.ID, df.all$supertype, df.all$group)
df.all<-merge(df.all, d.loc[,grepl("LIMS|Region", colnames(d.loc))], 
              by="LIMS.ID", all.x = T)

how many paired samples?

just look at presence/absence

#df.all.3<-df.all[!is.na(df.all$value), ]

apg<-ggplot(data = df.all)+
  geom_tile(aes(x=supertype, y=assay, fill=log10(value)))+
  scale_fill_gradient2(low = "black", mid = "#558000", high = "white", 
                       midpoint = 1, na.value = "grey")+
  facet_grid(.~group, scales = "free_x", space = "free_x")+
  theme_bw()+
  theme(axis.text.x =element_text(angle = 45, hjust = 1),
        axis.text.y =element_text(angle = 45, hjust = 1),
        text = element_text(size=14), 
        axis.title = element_blank(), 
        legend.position = "top", 
        legend.direction = "horizontal")
apg

look at actual concentrations

#summarized paired samples
df.all.sum <- df.all %>%
  group_by(variable) %>%
  summarise(assays=length(value)-sum(is.na(value)))
kable(df.all.sum)
variable assays
qPCR.anaC..copies.mL. 120
Anatoxin.a..ug.L. 120
qPCR.mcyE..copies.mL. 149
Microcystin.Nod…ug.L. 149
qPCR.cyrA..copies.mL. 26
Cylindrospermopsin..ug.L. 26
qPCR.sxtA..copies.mL. 24
Saxitoxin..ug.L. 24
df.all.1<-df.all[grepl("qPCR", df.all$variable), ] #genes
df.all.2<-df.all[!grepl("qPCR", df.all$variable), ] #toxins

#summary stats of paired genes
df.harm.gene.summ <- df.all.1[df.all.1$value>0, ] %>%
  group_by(System.Supertype, Sample.SuperType, variable) %>%
  summarise(min=min(value, na.rm = T), 
            max=max(value, na.rm = T), 
            mean=mean(value, na.rm = T), 
            count=length(variable))

#summary stats of paired toxin
df.harm.tox.summ <- df.all.2[df.all.2$value>0, ] %>%
  group_by(System.Supertype, Sample.SuperType, variable) %>%
  summarise(min=min(value, na.rm = T), 
            max=max(value, na.rm = T), 
            mean=mean(value, na.rm = T), 
            count=length(variable))

#put together to plot
df.harm.summ<-rbind(df.harm.gene.summ, df.harm.tox.summ)

hg<-ggplot(data = df.all)+
  geom_boxplot(aes(x=supertype, y=log10(value), color=System.Supertype))+
  geom_point(aes(x=supertype, y=log10(value), 
                 color=System.Supertype, shape=Sample.SuperType), 
             alpha=0.5, size=3)+
  facet_grid(assay~group, scales = "free_y")+
  labs(x="", y="log10 transformed", 
       title = 
         paste("FHABs 2023-2025 harmonized samples\nwith paired gene-toxin data", 
       Sys.Date()))+
  scale_shape_manual(values=c(1,2,6))+
  scale_color_manual(values=c("#2985a3", "#14b814"))+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.position = "bottom")+
  guides(color=guide_legend(direction = "vertical"), 
         shape=guide_legend(direction = "vertical"))
hg

### linear regression of all paired

df.all.4<-merge(df.all.1, 
                df.all.2[, grepl("value|unique", colnames(df.all.2))], 
                by="uniqueID")
colnames(df.all.4)<-gsub("value.x", "gene", colnames(df.all.4))
colnames(df.all.4)<-gsub("value.y", "toxin", colnames(df.all.4))

lnp<-ggplot(data = df.all.4)+
  geom_point(aes(x=log10(gene), y=log10(toxin), 
                 color=group, shape=group), size=3, alpha=0.75)+
  facet_grid(System.Supertype~Sample.SuperType)+
  scale_shape_manual(values=c(15, 3, 16, 4))+
  scale_color_manual(values=c("#5ab4ac", "#8c510a", "#1b7837", "#762a83"))+
  theme_bw()+
  theme(text = element_text(size=14))
lnp

logarithmic regression of all paired

colnames(df.all.4)
##  [1] "uniqueID"         "LIMS.ID"          "Sample.SuperType" "System.Supertype"
##  [5] "variable"         "gene"             "assay"            "group"           
##  [9] "supertype"        "Region"           "toxin"
df.all.4$gene[df.all.4$gene==0]<-NA
df.all.4$toxin[df.all.4$toxin==0]<-NA

df.all.g<-df.all.4[, match(c("Sample.SuperType", "System.Supertype", 
                             "gene", "toxin", "group"), 
                           colnames(df.all.4))]

#MCY and ANA only
df.g<-df.all.g[grepl("MCY|ANA", df.all.g$group), ]
df.g$supertype<-paste(df.g$System.Supertype, df.g$Sample.SuperType)
df.g$supertype<-gsub("nonflowing", "NF", df.g$supertype)
df.g$supertype<-gsub("flowing", "F", df.g$supertype)

ng<-ggplot(data = df.g)+
  geom_abline(slope = 1, intercept = 1.25, color="grey", alpha=0.75)+
  geom_abline(slope = 1, intercept = 5, color="grey", alpha=0.75)+
  geom_vline(xintercept=0, lty=2, color="red")+
  geom_vline(xintercept=1, lty=2, color="red")+
  geom_qq(aes(sample=log10(gene)), shape=16)+
  geom_qq(aes(sample=log10(toxin)), shape=8)+
  facet_grid(supertype~group)+
  ggtitle("qq plot, .log10(gene), *log10(toxin)")+
  theme_bw()
ng

dg<-ggplot(data = df.g)+
  geom_density(aes(x=log10(gene)))+
  geom_density(aes(x=log10(toxin)), color="red")+
  facet_grid(supertype~group)+
  ggtitle("density plot, black:log10(gene), red: log10(toxin)")+
  theme_bw()
dg

df.g.a<-df.all.g[grepl("ANA", df.all.g$group), ]
df.g.a<-df.g.a[, !grepl("group", colnames(df.g.a))]
df.g.a$toxin.pa<-ifelse(df.g.a$toxin>0, 1, 0)
df.g.a<-subset(df.g.a, !is.na(df.g.a$gene))
df.g.a<-subset(df.g.a, !is.na(df.g.a$toxin))

df.g.a$gene.log10<-log10(df.g.a$gene)
df.g.a$toxin.log10<-log10(df.g.a$toxin)
df.g.a$toxin.log10<-ifelse(df.g.a$toxin.log10<0, 0, df.g.a$toxin.log10)

df.g.a1<-df.g.a[!grepl("nonflowing", df.g.a$System.Supertype), ]
df.g.a1<-df.g.a1[grepl("AlgalMat", df.g.a1$Sample.SuperType), ]
df.g.a1<-df.g.a1[, !grepl("SuperType", colnames(df.g.a1), ignore.case = T)]

glm.1 <- glm(toxin~gene, data=df.g.a1[], 
             family = Gamma(link = "inverse"))
#Gamma(link = "inverse")
#binomial(link="logit")
summary(glm.1)
## 
## Call:
## glm(formula = toxin ~ gene, family = Gamma(link = "inverse"), 
##     data = df.g.a1[])
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.966  -3.471  -2.582  -1.935   8.433  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.825e-04  2.555e-04   1.106    0.275
## gene        -3.170e-13  2.002e-12  -0.158    0.875
## 
## (Dispersion parameter for Gamma family taken to be 35.1535)
## 
##     Null deviance: 417.75  on 46  degrees of freedom
## Residual deviance: 417.08  on 45  degrees of freedom
## AIC: 641.21
## 
## Number of Fisher Scoring iterations: 13

linear regression of MCY (microcystin, MCYE)

prep

df.all.4.mcy<-df.all.4[grepl("MCY", df.all.4$group), ] #just MCY
df.all.4.mcy<-df.all.4.mcy[!is.na(df.all.4.mcy$toxin),] #remove NAs
df.all.4.mcy<-df.all.4.mcy[!is.na(df.all.4.mcy$gene),] #remove NAs

#for linear model
df.all.4.mcy1<-df.all.4.mcy
df.all.4.mcy1$zero<-df.all.4.mcy1$gene+df.all.4.mcy1$toxin
df.all.4.mcy1<-subset(df.all.4.mcy1, df.all.4.mcy1$zero>0) 
#remove both toxin and gene are zero

model, plot

#linear models of each supertype
lm1<-lm(log10(toxin) ~ log10(gene), 
        data = df.all.4.mcy1[df.all.4.mcy1$supertype=="NF Scum",])
summary(lm1)
## 
## Call:
## lm(formula = log10(toxin) ~ log10(gene), data = df.all.4.mcy1[df.all.4.mcy1$supertype == 
##     "NF Scum", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7245 -0.5016 -0.3602  0.6260  1.1594 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -5.7744     1.8177  -3.177  0.01915 * 
## log10(gene)   1.1877     0.2842   4.180  0.00582 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7688 on 6 degrees of freedom
## Multiple R-squared:  0.7444, Adjusted R-squared:  0.7017 
## F-statistic: 17.47 on 1 and 6 DF,  p-value: 0.005815
lm2<-lm(log10(toxin) ~ log10(gene), 
        data = df.all.4.mcy1[df.all.4.mcy1$supertype=="NF Water",])
summary(lm2)
## 
## Call:
## lm(formula = log10(toxin) ~ log10(gene), data = df.all.4.mcy1[df.all.4.mcy1$supertype == 
##     "NF Water", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0262 -0.3765 -0.1162  0.3240  1.5742 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.29961    0.43197  -7.639 1.71e-07 ***
## log10(gene)  0.77769    0.08234   9.445 5.22e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6173 on 21 degrees of freedom
## Multiple R-squared:  0.8094, Adjusted R-squared:  0.8004 
## F-statistic:  89.2 on 1 and 21 DF,  p-value: 5.218e-09
lm3<-lm(log10(toxin) ~ log10(gene), 
        data = df.all.4.mcy1[df.all.4.mcy1$supertype=="F Water",])
summary(lm3)
## 
## Call:
## lm(formula = log10(toxin) ~ log10(gene), data = df.all.4.mcy1[df.all.4.mcy1$supertype == 
##     "F Water", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9127 -0.1326  0.1651  0.1823  0.4004 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -2.0857     0.7781  -2.680  0.03152 * 
## log10(gene)   0.6017     0.1472   4.087  0.00465 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4176 on 7 degrees of freedom
## Multiple R-squared:  0.7047, Adjusted R-squared:  0.6625 
## F-statistic:  16.7 on 1 and 7 DF,  p-value: 0.004648
lm.mcy.combo<-df.all.4.mcy1[grepl("NF Scum|NF Water|F Water", df.all.4.mcy1$supertype), ]

#plot all supertypes with linear smoothing, separate by facet
lnp2<-ggplot(data = lm.mcy.combo)+
  geom_point(aes(x=log10(gene), y=log10(toxin), shape=Region), 
             size=3, alpha=0.75, color="#1b7837")+
  geom_smooth(aes(x=log10(gene), y=log10(toxin)), 
              method = "lm")+
  scale_shape_manual(values=c(15, 16, 17, 18, 8, 4))+
  facet_grid(~supertype, scales = "free_x")+
  theme_bw()+
  theme(text = element_text(size=14), 
        legend.position = "bottom", 
        legend.direction = "horizontal")+
  guides(shape=guide_legend(ncol = 6))
lnp2

linear regression of ANA (anatoxin, ANA)

prep

df.all.4.ana<-df.all.4[grepl("ANA", df.all.4$group), ] #get just ana
df.all.4.ana<-df.all.4.ana[!is.na(df.all.4.ana$toxin),] #remove NAs
df.all.4.ana<-df.all.4.ana[!is.na(df.all.4.ana$gene),] #remove NAs

model, plot

df.all.4.ana.a<-df.all.4.ana[df.all.4.ana$supertype=="F AlgalMat",]
#just sample supertype with most data points

#remove outliers
df.all.4.ana.a1<-subset(df.all.4.ana.a, df.all.4.ana.a$toxin<5000)
df.all.4.ana.a1<-subset(df.all.4.ana.a1, df.all.4.ana.a1$gene<12e7)

df.all.ana.select<-d.loc[d.loc$LIMS.ID %in% df.all.4.ana.a1$LIMS.ID, ]

#linear model
lm.an1<-lm((toxin) ~ (gene), data = df.all.4.ana.a1)
summary(lm.an1)
## 
## Call:
## lm(formula = (toxin) ~ (gene), data = df.all.4.ana.a1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1070.38   -38.17   -32.63    31.41   969.69 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.455e+01  5.018e+01   0.689    0.495    
## gene        3.809e-05  2.220e-06  17.157   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 310.7 on 41 degrees of freedom
## Multiple R-squared:  0.8777, Adjusted R-squared:  0.8748 
## F-statistic: 294.4 on 1 and 41 DF,  p-value: < 2.2e-16
#plot linear model, with linear smoothing
lnp3<-ggplot(data = df.all.4.ana.a1, 
             aes(x=(gene), y=(toxin)))+
  geom_point(aes(shape=Region), 
             size=3, alpha=0.75, color="#1b7837")+
  geom_smooth(method = "lm")+
  scale_shape_manual(values=c(15, 16, 17, 18, 8, 4))+
  facet_grid(~supertype, scales = "free_x")+
  theme_bw()+
  theme(text = element_text(size=14), 
        legend.position = "bottom", 
        legend.direction = "horizontal")+
  guides(shape=guide_legend(ncol = 6))
lnp3