This code is for an EPS 139 final project and analyzes historic thermometer and temperature-at-depth data. Generative AI from ChatGPT3.5 assisted in the making of this code. This code is sequential: products from previous chunks are used in subsequent ones.

—- PRELIMINARIES AND DATA UPLOADS —- Uploading all data from “DATA” folder.

Adding to library

# install all possible necessary libraries 
library(ape)
library(car)
## Loading required package: carData
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:ape':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(dunn.test)
library(FSA)
## Registered S3 methods overwritten by 'FSA':
##   method       from
##   confint.boot car 
##   hist.boot    car
## ## FSA v0.9.5. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
## 
## Attaching package: 'FSA'
## The following object is masked from 'package:car':
## 
##     bootCase
library(ForestGapR)
library(ForestTools)
library(foreach)
library(ggplot2)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ape':
## 
##     rotate
library(ggsignif)
library(lattice)
library(lidR)
library(multcompView)
library(raster)
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following objects are masked from 'package:lidR':
## 
##     projection, projection<-
## The following object is masked from 'package:dplyr':
## 
##     select
library(readr)
library(RColorBrewer)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## 
## Attaching package: 'sf'
## The following object is masked from 'package:lidR':
## 
##     st_concave_hull
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:raster':
## 
##     extract
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ tidyr::extract()    masks raster::extract()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::recode()     masks car::recode()
## ✖ raster::select()    masks dplyr::select()
## ✖ purrr::some()       masks car::some()
## ✖ purrr::when()       masks foreach::when()
## ✖ dplyr::where()      masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vegan)
## Loading required package: permute
## This is vegan 2.6-4
library(viridis)
## Loading required package: viridisLite

Upload data

# set working directory
setwd("/Users/beatriceyoud/Desktop/139/Challenger/") 

# all data thermometer anoms
TE <- read_csv("LLTh.csv") # read in data
## New names:
## Rows: 430 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (12): Previous Thermometer, ...2, ...3, Thermometer of interest, ...5, .... dbl
## (3): G_B, Long, Lat
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...15`
head(TE) # examine data
## # A tibble: 6 × 15
##   `Previous Thermometer` ...2  ...3   Thermometer of interes…¹ ...5  ...6  ...7 
##   <chr>                  <chr> <chr>  <chr>                    <chr> <chr> <chr>
## 1 Temperature            Depth Number Temperature              Depth Numb… Corr…
## 2 55.5                   130   36     55.8                     160   42    <NA> 
## 3 63.8                   60    31     64                       70    31    <NA> 
## 4 45.5                   600   37     46.5                     700   34    <NA> 
## 5 39.5                   900   30     39.8                     1000  39    <NA> 
## 6 39.5                   900   30     39.8                     1000  39    <NA> 
## # ℹ abbreviated name: ¹​`Thermometer of interest`
## # ℹ 8 more variables: `Next thermometer` <chr>, ...9 <chr>, ...10 <chr>,
## #   G_B <dbl>, Long <dbl>, Lat <dbl>, Date <chr>, ...15 <chr>
# ensure thermometers are in dataframe
TES <- data.frame(TE)

# all data on thermometer appearances 
THTall <- read_csv("EPS 139 Thermometers  - Thermometer tallies .csv.csv") # read in data
## Rows: 4934 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(THTall) # examine data
## # A tibble: 6 × 1
##   Number
##   <chr> 
## 1 47    
## 2 54    
## 3 54    
## 4 36    
## 5 67    
## 6 39

—- ANALYSIS USED IN PAPER —-

Unique thermometers

# make a new dataframe for all thermometers
thermoms <- data.frame(TE$...6)
# get the list of the unique thermometers
UT <- unique(thermoms)

Residuals (warnings of NAs show up, disregard)

new_col_names <- unique(data.frame(UT$TE....6)) # get unique thermometers as column names

# new matrix for the unique thermometers with the new column names
UTs <- data.frame(matrix(NA, ncol = nrow(new_col_names)-1, nrow = 1))

#loop through each column to rename for unique thermometer names
for (i in 1:nrow(new_col_names)-1) {
colnames(UTs)[i] <-  new_col_names[i+1,1]
}

# initialize new dataframes with unique thermometer columns
DepUTs <- UTs 
thermNUM <- UTs
LongTh <- UTs
LatTh <- UTs
GBb <- UTs
SL <- data.frame(UTs)

# loop through each unique thermometer 
for (i in 1:56) {
TOI <- UT[i+1,1] # get thermometer of interest
rows_with_TOI <- TES[, 6] == (TOI) # find rows in thermometer anomaly with unqiue thermometer
TOIs <- TES[rows_with_TOI, ] # pull all information from the thermometers of interest

Res_values <- c() # residual values

for (t in 1:nrow(TOIs)) { #loop through thermometers of interest
  # save all values of interest
PT <- c(as.numeric(TOIs[t,1]),as.numeric(TOIs[t,2])) # previous
AT <- c(as.numeric(TOIs[t,8]),as.numeric(TOIs[t,9])) # after
Ts <- c(as.numeric(TOIs[t,4]),as.numeric(TOIs[t,5]),  as.numeric(TOIs[t,6])) # Temp of interest
LL <- c(as.numeric(TOIs[t,12]), as.numeric(TOIs[t,13])) # lat and long
GB <- as.numeric(TOIs[t,11])

# combining into dataframe for x,y
points_df <- data.frame(x = c(PT[1], AT[1]), y = c(PT[2], AT[2]))
# linear model based on the beofre and after points
linear_model <- lm(x ~ y, data = points_df)
# predict values for point temp of interest
predicted_value <- predict(linear_model, newdata = data.frame(y = Ts[2]))

# Record anomaly
Res <- abs(predicted_value - Ts[1])
#print(Res) # a check

# concatenate resulting values by thermometer
Res_values <- c(Res_values, Res)
Residuals <- as.numeric(Res_values)
#print(Residuals)

UTs[t,i] <- Res
DepUTs[t,i] <- Ts[2]
thermNUM[t,i] <- Ts[3]
LongTh[t,i] <- LL[1]
LatTh[t,i] <-LL[2]
GBb[t,i] <- GB[1]
SL[1,i] <- -99

# Fit a linear regression model to residuals against time
      if (length(Res_values) > 0) {
        time <- 1:length(Res_values)
        lm_model <- try(lm(Res_values ~ time)) 
        if (!inherits(lm_model, "try-error")) {
          # get slope
          slope <- coef(lm_model)[2]
          
          # if the slope is positive or negative
          if (!is.na(slope)) {
            if (slope > 0) {
              SL[1, i] <- 1
            } else if (slope < 0) {
              SL[1, i] <- -1
            } else {
              SL[1, i] <- 0
            }
          }
        }
}
}
}
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

Thermometers better or worse over time?

# Transposing 
SL_column <- as.data.frame(t(SL[1,]))
# Renaming
colnames(SL_column) <- "SL"
# Sorting
SL_sorted <- SL_column[order(rownames(SL_column)), , drop = FALSE]
# Tally the independent numbers
tally <- table(SL_sorted$SL)

#print(SL_sorted) #as a check
#print(tally) #as a check

tally[2:3] # view observed frequencies
## 
## -1  1 
## 22 23
# and write them
observed <- c(22, 23)

# expected frequencies (assuming 50:50 which is 22.5 and 22.5)
expected <- c(22.5, 22.5)

# get chi-squared statistic
chi_squared <- sum((observed - expected)^2 / expected)
# and calculate dfs
df <- length(observed) - 1
# and p value
p_value <- pchisq(chi_squared, df, lower.tail = FALSE)
# View statistic and the p-value
print(chi_squared)
## [1] 0.02222222
print(p_value)
## [1] 0.8814975

Plot residual anom by depth

# Initialize list
data_list <- list()

# Loop through each column
for (i in 1:ncol(UTs)) {
  # pull out UTs and Depth for the current column
  D <- data.frame(UTs = UTs[, i], Depth = DepUTs[, i], Names = names(UTs)[i])
  # Remove rows with NA values
  D <- na.omit(D)
  # Append the data frame to the list
  data_list[[i]] <- D
 # print(names(UTs)[i]) #check
}

# Combine data frames to list by row binding
merged_data <- do.call(rbind, data_list)
#print(merged_data) # a check

# color assignments with viridis color scheme
colors <- viridis(length(unique(merged_data$Names)))

# for saving figure
#pdf("plot_merged_data.pdf")

# Points plot
plot(merged_data$Depth, merged_data$UTs, col = colors[as.numeric(factor(merged_data$Names))], xlab = "Depth", ylab = "Residuals")

# legend
legend("topright", legend = levels(factor(merged_data$Names)), fill = colors, title = "Thermometer", cex = 1, ncol = 4)

#dev.off()

# model selection steps
step_model <- step(lm(UTs ~ Depth, data = merged_data), direction = "both")
## Start:  AIC=727.75
## UTs ~ Depth
## 
##         Df Sum of Sq    RSS    AIC
## <none>               2318.2 727.75
## - Depth  1    21.524 2339.7 729.72
summary(step_model) # view model
## 
## Call:
## lm(formula = UTs ~ Depth, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4675 -0.9000 -0.5828  0.0604 21.1072 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.5287616  0.1712032   8.930   <2e-16 ***
## Depth       -0.0003062  0.0001538  -1.991   0.0471 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.33 on 427 degrees of freedom
## Multiple R-squared:  0.009199,   Adjusted R-squared:  0.006879 
## F-statistic: 3.965 on 1 and 427 DF,  p-value: 0.0471

View errors & error ratios by thermometer

# Create empty data frame with column names fore each thermometer name/number
utall <- data.frame(unique(THTall))
subset_data <- data.frame(UTALone = character(), AllTallies = integer(), ResidTallies = integer())  

# loop through thermometers and get correct numbers and error numbers
for (i in 1:81) {
  UTALone <- utall[i, 1]
  subset_data_i <- merged_data[merged_data[, 3] == UTALone, ]
  ResidTallies <- nrow(subset_data_i)
  AllTallies <- nrow(THTall[THTall[, 1] == UTALone, ])
  subset_data_i <- data.frame(UTALone = UTALone, AllTallies = AllTallies, ResidTallies = ResidTallies)
  subset_data <- rbind(subset_data, subset_data_i)
}

# calculate the ratio of correct and incorrect
subset_data$Ratio <- subset_data$ResidTallies / subset_data$AllTallies

# Plot therm ratio by thermometer
plot(subset_data$Ratio, xlab = "Thermometer index", ylab = "Ratio of thermometers with error", ylim = c(0, 1.1))

#plot(subset_data$AllTallies~subset_data$ResidTallies) # another way to look at error


library(viridis)

# Create the plot
plot(subset_data$ResidTallies, subset_data$AllTallies,
     xlab = "Error Thermometer Counts", ylab = "All Thermometer Counts", 
     col = viridis(length(unique(subset_data$UTALone))),
     pch = 16, cex = 0.7)

# Add legend with four columns... I ended up putting this in manually
#legend("bottomright", legend = levels(factor(subset_data$UTALone)), 
 #      col = viridis(length(unique(subset_data$UTALone))), pch = 16, cex = 0.35, title = #"UTALone",
 #      ncol = 4)

# Fit a linear model
fit <- lm(AllTallies ~ ResidTallies, data = subset_data)

# Print summary statistics
summary(fit)
## 
## Call:
## lm(formula = AllTallies ~ ResidTallies, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -125.932  -20.472  -13.732    4.688  200.688 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   14.7318     7.0414   2.092   0.0396 *  
## ResidTallies   8.7400     0.7355  11.883   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.85 on 79 degrees of freedom
## Multiple R-squared:  0.6412, Adjusted R-squared:  0.6367 
## F-statistic: 141.2 on 1 and 79 DF,  p-value: < 2.2e-16

K Mean Clustering

# Add a new column with row numbers (needed bc thermometer labels have letters)
subset_data$Row_Number <- row.names(subset_data)
# Print updated dataset
# print(subset_data) # a check

#new for clustering with just ratios
subset_dataA <- subset_data[,4]

# Perform K-means clustering
kmeans_result <- kmeans(subset_dataA, centers = 4)  # centers can be adjusted

# Plot with clusters
plot(subset_data$Ratio, col = kmeans_result$cluster, xlab = "Thermometer index", ylab = "Ratio of thermometers with error", ylim = c(0, 1.1),)

legend("topright", legend = unique(kmeans_result$cluster), col = 1:length(unique(kmeans_result$cluster)), pch = 1)

# Append cluster assignments to subset_dataA
subset_data$Cluster <- kmeans_result$cluster

subset_cluster_1 <- subset_data[subset_data$Cluster == 1, ]
#max(subset_cluster_1$Ratio) # is 0.1764706

Elbow method

# loop through clusters. Placed cap at 10
wcss <- vector()
for (i in 1:10) {
  kmeans_result <- kmeans(subset_dataA, centers = i)
  wcss[i] <- sum(kmeans_result$withinss) # calc wcss
}

# Plot Elbow Method
plot(1:10, wcss, type = "b", xlab = "N Clusters", ylab = "WCSS",
     main = "Elbow Method")

# first differences of WCSS
diff_wcss <- c(0, diff(wcss))

Histograms of all therms

ggplot(subset_data, aes(x=factor(UTALone), y=AllTallies)) +
  geom_col(fill="black") +
  labs(title="Tally counts for all thermometers", x="Thermometer", y="Tally")

#ggsave("plot.pdf", device = "pdf") # for saving