R Final Project Assignment

Part 1 Variation Graphs:

#two panels in one row so the Esmerald and Ducke barplots end up side by side
par(mfrow = c(1, 2))

#esmerald data: load the transect workbook (read_excel comes from readxl)
df <- read_excel(path = "ESMERALD.xlsx")

## New names:
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMDBH` -> `STEMDBH...15`
## • `STEMDBH` -> `STEMDBH...16`
## • `STEMDBH` -> `STEMDBH...17`
## • `STEMDBH` -> `STEMDBH...18`
## • `STEMDBH` -> `STEMDBH...19`
## • `STEMDBH` -> `STEMDBH...20`
## • `STEMDBH` -> `STEMDBH...21`
## • `STEMDBH` -> `STEMDBH...22`
## • `STEMDBH` -> `STEMDBH...23`

#per-transect (Line) summary for Esmerald: individual counts, mean DBH,
#Simpson and Shannon diversity. Expects `df` to hold the ESMERALD.xlsx sheet.
df$Line <- as.numeric(df$Line)
df <- df[order(df$Line), ]

#cleans up missing values
#the count column is pulled out with [[ ]] so the row index is a plain logical
#vector; df[,"N(IND)"] on a tibble is a one-column table, and using the
#resulting matrix as a row index is deprecated in tibble (>= 3.0.0)
df <- df[!is.na(df[["N(IND)"]]), ]

#total number of individuals on each line
df_line <- aggregate(df[["N(IND)"]], list(Line = df$Line), sum)
names(df_line)[2] <- "N(IND)"
dbh_cols <- grep("STEMDBH", colnames(df))

#set the columns to numeric so the calculations actually work
df[dbh_cols] <- lapply(df[dbh_cols], as.numeric)

#calculations - given from lab 3
#DBHsum adds up every stem of a record; DBHmean is the line total divided by
#the number of individuals on that line
df$DBHsum <- rowSums(df[dbh_cols], na.rm = TRUE)
df_line$DBHsum <- aggregate(df$DBHsum, list(Line = df$Line), sum)$x
df_line$DBHmean <- df_line$DBHsum / df_line[["N(IND)"]]

#chose green this time for a more forest-like vibe :)
barplot(
  DBHmean ~ Line,
  data = df_line,
  main = "Esmerald Variation",
  col = "lightgreen",
  ylim = c(0, 14)
)

#esmerald diversity calculations
#following Lab 3 slides instead of hardcoding like I did in my first progress update
#Prop = share of the line's individuals that belongs to each record;
#match() looks up the line total for every row of df
df$Prop <- df[["N(IND)"]] / df_line[["N(IND)"]][match(df$Line, df_line$Line)]

#simpson Index: 1 - sum(p^2) per line
df$PropSquared <- df$Prop^2
df_line$D <- aggregate(list(D = df$PropSquared), list(Line = df$Line), sum)$D
df_line$Simpson <- 1 - df_line$D

#shannon Index (adding log function as per instructions): -sum(p * log(p)) per line
#records with p == 0 contribute 0 instead of 0 * log(0) = NaN
df$LogShannon <- ifelse(df$Prop > 0, df$Prop * log(df$Prop), 0)
df_line$Shannon <- -aggregate(list(H = df$LogShannon), list(Line = df$Line), sum)$H

#identifies the study area of each row
df_line$StudyArea <- "Esmerald"

#stores the averages for Esmerald to use in the final table
EsmeraldShannon <- mean(df_line$Shannon)
EsmeraldSimpson <- mean(df_line$Simpson)
EsmeraldDBHmean <- mean(df_line$DBHmean)

#saves df_line so it doesn't get overwritten by Ducke
Esmerald_df_line <- df_line


#ducke data: load the transect workbook; this replaces the Esmerald sheet held in df
df <- read_excel(path = "DUCKE.xlsx")

## New names:
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMDBH` -> `STEMDBH...15`
## • `STEMDBH` -> `STEMDBH...16`
## • `STEMDBH` -> `STEMDBH...17`
## • `STEMDBH` -> `STEMDBH...18`
## • `STEMDBH` -> `STEMDBH...19`
## • `STEMDBH` -> `STEMDBH...20`
## • `STEMDBH` -> `STEMDBH...21`
## • `STEMDBH` -> `STEMDBH...22`
## • `STEMDBH` -> `STEMDBH...23`
## • `STEMDBH` -> `STEMDBH...24`

#per-transect (Line) summary for Ducke: individual counts, mean DBH,
#Simpson and Shannon diversity. Expects `df` to hold the DUCKE.xlsx sheet.
df$Line <- as.numeric(df$Line)
df <- df[order(df$Line), ]

#cleans up missing values
#the count column is pulled out with [[ ]] so the row index is a plain logical
#vector; df[,"N(IND)"] on a tibble is a one-column table, and using the
#resulting matrix as a row index is deprecated in tibble (>= 3.0.0)
df <- df[!is.na(df[["N(IND)"]]), ]

#total number of individuals on each line
df_line <- aggregate(df[["N(IND)"]], list(Line = df$Line), sum)
names(df_line)[2] <- "N(IND)"
dbh_cols <- grep("STEMDBH", colnames(df))

#set the columns to numeric
df[dbh_cols] <- lapply(df[dbh_cols], as.numeric)

#calculations
#DBHsum adds up every stem of a record; DBHmean is the line total divided by
#the number of individuals on that line
df$DBHsum <- rowSums(df[dbh_cols], na.rm = TRUE)
df_line$DBHsum <- aggregate(df$DBHsum, list(Line = df$Line), sum)$x
df_line$DBHmean <- df_line$DBHsum / df_line[["N(IND)"]]

#outputs the barplot
barplot(
  DBHmean ~ Line,
  data = df_line,
  main = "Ducke Variation",
  col = "darkgreen",
  ylim = c(0, 14)
)

#Ducke diversity calculations
#Prop = share of the line's individuals that belongs to each record;
#match() looks up the line total for every row of df
df$Prop <- df[["N(IND)"]] / df_line[["N(IND)"]][match(df$Line, df_line$Line)]

#Simpson Index: 1 - sum(p^2) per line
df$PropSquared <- df$Prop^2
df_line$D <- aggregate(list(D = df$PropSquared), list(Line = df$Line), sum)$D
df_line$Simpson <- 1 - df_line$D

#Shannon Index: -sum(p * log(p)) per line
#records with p == 0 contribute 0 instead of 0 * log(0) = NaN
df$LogShannon <- ifelse(df$Prop > 0, df$Prop * log(df$Prop), 0)
df_line$Shannon <- -aggregate(list(H = df$LogShannon), list(Line = df$Line), sum)$H

#identifies the study area of each row
df_line$StudyArea <- "Ducke"

#saves the data for Ducke for graphing purposes
DuckeShannon <- mean(df_line$Shannon)
DuckeSimpson <- mean(df_line$Simpson)
DuckeDBHmean <- mean(df_line$DBHmean)
Ducke_df_line <- df_line


#transect and dbh comparison
#keeps the same four columns from each study area's per-line table and stacks
#them with base R rbind, Esmerald rows first and Ducke rows second
CombinedTransectTable <- do.call(
  rbind,
  lapply(
    list(Esmerald_df_line, Ducke_df_line),
    function(area) area[, c("StudyArea", "Line", "Shannon", "Simpson")]
  )
)

print(CombinedTransectTable)

##    StudyArea Line  Shannon   Simpson
## 1   Esmerald    1 3.092075 0.9464575
## 2   Esmerald    2 2.954003 0.9355102
## 3   Esmerald    3 3.407904 0.9600000
## 4   Esmerald    4 3.004227 0.9437500
## 5   Esmerald    5 2.375988 0.8731098
## 6   Esmerald    6 3.080476 0.9464923
## 7   Esmerald    7 2.846720 0.9323621
## 8   Esmerald    8 2.593866 0.8975069
## 9   Esmerald    9 2.531012 0.9005102
## 10  Esmerald   10 2.397267 0.8798186
## 11     Ducke    1 3.401197 0.9666667
## 12     Ducke    2 3.429486 0.9660494
## 13     Ducke    3 3.728961 0.9756625
## 14     Ducke    4 3.691537 0.9732911
## 15     Ducke    5 3.704663 0.9750567
## 16     Ducke    6 3.344549 0.9635796
## 17     Ducke    7 3.122425 0.9519890
## 18     Ducke    8 3.431665 0.9479384
## 19     Ducke    9 3.271689 0.9607610
## 20     Ducke   10 3.258097 0.9615385

#back to a single panel for the next figure
par(mfrow = c(1, 1))

#as per (Geoff's :D) recommendation, a box plot comparing the per-line mean DBH
#of the two study areas; the list names become the group labels under each box
boxplot(
  list(Esmerald = Esmerald_df_line$DBHmean, Ducke = Ducke_df_line$DBHmean),
  col = c("lightgreen", "darkgreen"),
  main = "Distribution of Mean DBH by Transect",
  ylab = "Mean DBH in cm"
)

Part 2 Ingesting Weather Data:

#Esmerald
EsmeraldData <- read_excel("ESMWeather.xlsx")

#the first iteration of this code referred to the temperature and rain columns
#by their header names. That only worked once: after re-running the code (and
#running it through "source") the names no longer matched, so the columns are
#referenced by position instead.
#NOTE(review): assumes column 2 is temperature and column 3 is rainfall in both
#workbooks - confirm against the spreadsheets

#makes columns strictly numerical
EsmeraldData[[2]] <- as.numeric(EsmeraldData[[2]])
EsmeraldData[[3]] <- as.numeric(EsmeraldData[[3]])

#annual figures for Esmerald: mean of column 2, total of column 3
EsmeraldAnnualTemp <- mean(EsmeraldData[[2]], na.rm = TRUE)
EsmeraldAnnualRain <- sum(EsmeraldData[[3]], na.rm = TRUE)

#Ducke
DuckeData <- read_excel("DUCWeather.xlsx")

#makes columns strictly numerical
DuckeData[[2]] <- as.numeric(DuckeData[[2]])
DuckeData[[3]] <- as.numeric(DuckeData[[3]])

#annual figures for Ducke: mean of column 2, total of column 3
DuckeAnnualTemp <- mean(DuckeData[[2]], na.rm = TRUE)
DuckeAnnualRain <- sum(DuckeData[[3]], na.rm = TRUE)

#values for Shannon and Simpson index are calculated dynamically from the first part
#one row per study area; the temperature column label was previously misspelled
#"AvgerageAnnTemp"
CombinedData <- data.frame(
  Area = c("Esmerald", "Ducke"),
  Shannon = c(EsmeraldShannon, DuckeShannon),
  Simpson = c(EsmeraldSimpson, DuckeSimpson),
  MeanDBH = c(EsmeraldDBHmean, DuckeDBHmean),
  AverageAnnTemp = c(EsmeraldAnnualTemp, DuckeAnnualTemp),
  TotalAnnualRain = c(EsmeraldAnnualRain, DuckeAnnualRain)
)

#prints out the combined summary table for esmerald and ducke
print(CombinedData)

##       Area  Shannon   Simpson  MeanDBH AverageAnnTemp TotalAnnualRain
## 1 Esmerald 2.828354 0.9215518 7.841486       81.41667            38.5
## 2    Ducke 3.438427 0.9642533 9.001136       82.16667            69.3

Part 3 Finalization of the Graphs:

#finalization of the climate profiles for both areas to conclude which is more
#biodiverse and how rainfall, temperature, and mean DBH play into this

#for this portion I wanted to use more complex libraries to make fun plots
#here I use tidyr, and ggplot2

#gives every column of the summary table a presentation-ready name
#(assigned by position, so the first column becomes StudyArea)
names(CombinedData) <- c(
  "StudyArea",
  "Shannon Diversity Index",
  "Simpson Diversity Index",
  "Mean DBH in cm",
  "Average Annual Temp in F",
  "Total Annual Rain in Inches"
)

#reshapes the data to long form (one row per study area and metric) so that
#everything can be faceted into one snapshot
plot_data <- pivot_longer(
  CombinedData,
  cols = -StudyArea,
  names_to = "Metric",
  values_to = "Value"
)

#a basic lollipop plot (named it Lolliplot because it sounds fun):
#geom_segment draws the stick from 0 up to the value and geom_point draws the
#heavier tip on top of it. The colors and titles are chosen so the figure can be
#read and interpreted easily; I liked theme_minimal better than the light, dark,
#etc. themes.
Lolliplot <- ggplot(
  plot_data,
  aes(x = StudyArea, y = Value, color = StudyArea)
) +
  geom_segment(
    aes(x = StudyArea, xend = StudyArea, y = 0, yend = Value),
    linewidth = 2
  ) +
  geom_point(size = 6) +
  #one panel per metric; scales = "free_y" gives every panel its own y axis so
  #each metric's values decide how high that panel goes
  facet_wrap(~Metric, scales = "free_y") +
  #wanted to use lollipop-adjacent colors for visualization
  scale_color_manual(values = c("Esmerald" = "pink", "Ducke" = "lightblue")) +
  #title and subtitle make it more informational; x and y are left blank since
  #axis titles would only add unnecessary identifiers to the page
  labs(
    title = "Biodiversity & Climate Profiles",
    subtitle = "Esmerald, Ecuador as Compared to Ducke, Brazil",
    x = "",
    y = ""
  ) +
  theme_minimal()

#prints the lollipop plots
print(Lolliplot)

#The figure shown in the report was reformatted to look nicer; the underlying output data are the same as what this code produces

R Final Project Assignment

Maximilian Luetz

4/27/2026

Part 1 Variation Graphs:

Part 2 Ingesting Weather Data:

Part 3 Finalization of the Graphs: