This code was written for an EPS 139 final project and analyzes historic thermometer and temperature-at-depth data. Generative AI (ChatGPT 3.5) assisted in writing the code. The code is sequential: products from earlier chunks are used in subsequent ones.
—- PRELIMINARIES AND DATA UPLOADS —- Uploading all data from the “DATA” folder.
Attach libraries
# load all required libraries (assumed already installed)
library(ape)
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:ape':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(dunn.test)
library(FSA)
## Registered S3 methods overwritten by 'FSA':
## method from
## confint.boot car
## hist.boot car
## ## FSA v0.9.5. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:car':
##
## bootCase
library(ForestGapR)
library(ForestTools)
library(foreach)
library(ggplot2)
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ape':
##
## rotate
library(ggsignif)
library(lattice)
library(lidR)
library(multcompView)
library(raster)
## Loading required package: sp
##
## Attaching package: 'raster'
## The following objects are masked from 'package:lidR':
##
## projection, projection<-
## The following object is masked from 'package:dplyr':
##
## select
library(readr)
library(RColorBrewer)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
##
## Attaching package: 'sf'
## The following object is masked from 'package:lidR':
##
## st_concave_hull
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Registered S3 method overwritten by 'spdep':
## method from
## plot.mst ape
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:raster':
##
## extract
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ tidyr::extract() masks raster::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ raster::select() masks dplyr::select()
## ✖ purrr::some() masks car::some()
## ✖ purrr::when() masks foreach::when()
## ✖ dplyr::where() masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vegan)
## Loading required package: permute
## This is vegan 2.6-4
library(viridis)
## Loading required package: viridisLite
Upload data
# set working directory
setwd("/Users/beatriceyoud/Desktop/139/Challenger/")
# all thermometer anomaly data
TE <- read_csv("LLTh.csv") # read in data
## New names:
## Rows: 430 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (12): Previous Thermometer, ...2, ...3, Thermometer of interest, ...5, .... dbl
## (3): G_B, Long, Lat
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...15`
head(TE) # examine data
## # A tibble: 6 × 15
## `Previous Thermometer` ...2 ...3 Thermometer of interes…¹ ...5 ...6 ...7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Temperature Depth Number Temperature Depth Numb… Corr…
## 2 55.5 130 36 55.8 160 42 <NA>
## 3 63.8 60 31 64 70 31 <NA>
## 4 45.5 600 37 46.5 700 34 <NA>
## 5 39.5 900 30 39.8 1000 39 <NA>
## 6 39.5 900 30 39.8 1000 39 <NA>
## # ℹ abbreviated name: ¹`Thermometer of interest`
## # ℹ 8 more variables: `Next thermometer` <chr>, ...9 <chr>, ...10 <chr>,
## # G_B <dbl>, Long <dbl>, Lat <dbl>, Date <chr>, ...15 <chr>
# convert the tibble to a base data.frame for row/column indexing
TES <- data.frame(TE)
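# (sketch) readr can quiet the column-spec message on read:
# TE <- read_csv("LLTh.csv", show_col_types = FALSE)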
# all data on thermometer appearances
THTall <- read_csv("EPS 139 Thermometers - Thermometer tallies .csv.csv") # read in data
## Rows: 4934 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(THTall) # examine data
## # A tibble: 6 × 1
## Number
## <chr>
## 1 47
## 2 54
## 3 54
## 4 36
## 5 67
## 6 39
—- ANALYSIS USED IN PAPER —-
Unique thermometers
# make a new dataframe of all thermometer numbers (column ...6 of TE)
thermoms <- data.frame(TE$...6)
# get the list of the unique thermometers
UT <- unique(thermoms)
Residuals (NA-coercion warnings appear below and can be disregarded)
new_col_names <- unique(data.frame(UT$TE....6)) # get unique thermometers as column names
# new matrix for the unique thermometers with the new column names
UTs <- data.frame(matrix(NA, ncol = nrow(new_col_names)-1, nrow = 1))
# loop through each column, renaming to the unique thermometer names
# (parentheses matter: 1:nrow(new_col_names)-1 would iterate from 0)
for (i in 1:(nrow(new_col_names) - 1)) {
colnames(UTs)[i] <- new_col_names[i + 1, 1]
}
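# (sketch) an equivalent vectorized rename, assuming row 1 of new_col_names
# is the skipped header placeholder:
# colnames(UTs) <- new_col_names[-1, 1]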
# initialize new dataframes with unique thermometer columns
DepUTs <- UTs
thermNUM <- UTs
LongTh <- UTs
LatTh <- UTs
GBb <- UTs
SL <- data.frame(UTs)
# loop through each unique thermometer (56 unique thermometers, skipping the header row)
for (i in 1:56) {
TOI <- UT[i+1,1] # get thermometer of interest
rows_with_TOI <- TES[, 6] == (TOI) # find rows in the anomaly data with this unique thermometer
TOIs <- TES[rows_with_TOI, ] # pull all information from the thermometers of interest
Res_values <- c() # residual values
for (t in 1:nrow(TOIs)) { #loop through thermometers of interest
# save all values of interest
PT <- c(as.numeric(TOIs[t,1]),as.numeric(TOIs[t,2])) # previous
AT <- c(as.numeric(TOIs[t,8]),as.numeric(TOIs[t,9])) # after
Ts <- c(as.numeric(TOIs[t,4]),as.numeric(TOIs[t,5]), as.numeric(TOIs[t,6])) # Temp of interest
LL <- c(as.numeric(TOIs[t,12]), as.numeric(TOIs[t,13])) # lat and long
GB <- as.numeric(TOIs[t,11])
# combining into dataframe for x,y
points_df <- data.frame(x = c(PT[1], AT[1]), y = c(PT[2], AT[2]))
# linear model through the before and after points
linear_model <- lm(x ~ y, data = points_df)
# predict values for point temp of interest
predicted_value <- predict(linear_model, newdata = data.frame(y = Ts[2]))
# Record anomaly
Res <- abs(predicted_value - Ts[1])
#print(Res) # a check
# concatenate resulting values by thermometer
Res_values <- c(Res_values, Res)
Residuals <- as.numeric(Res_values)
#print(Residuals)
UTs[t,i] <- Res
DepUTs[t,i] <- Ts[2]
thermNUM[t,i] <- Ts[3]
LongTh[t,i] <- LL[1]
LatTh[t,i] <-LL[2]
GBb[t,i] <- GB[1]
SL[1,i] <- -99 # sentinel; overwritten below if a slope is estimated
# Fit a linear regression model to residuals against time
if (length(Res_values) > 0) {
time <- 1:length(Res_values)
lm_model <- try(lm(Res_values ~ time))
if (!inherits(lm_model, "try-error")) {
# get slope
slope <- coef(lm_model)[2]
# if the slope is positive or negative
if (!is.na(slope)) {
if (slope > 0) {
SL[1, i] <- 1
} else if (slope < 0) {
SL[1, i] <- -1
} else {
SL[1, i] <- 0
}
}
}
}
}
}
## Warning: NAs introduced by coercion
## Warning in predict.lm(linear_model, newdata = data.frame(y = Ts[2])):
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
(The two warnings above repeat many times during the loop and are expected: the coercion warning fires once per non-numeric cell, and the rank-deficient warning fires when the two bracketing depths are equal.)
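To make the interpolation step concrete: for each reading, a line is fit through the previous and next (temperature, depth) pairs, temperature is predicted at the depth of the thermometer of interest, and the residual is the absolute gap to its observed temperature. A minimal standalone sketch with made-up values (here res works out to 0.05):
# worked example of the two-point interpolation residual (made-up values)
prev <- c(temp = 55.5, depth = 130) # previous thermometer reading
nxt <- c(temp = 55.8, depth = 160) # next thermometer reading
obs <- c(temp = 55.6, depth = 145) # thermometer of interest
pts <- data.frame(x = c(prev["temp"], nxt["temp"]), y = c(prev["depth"], nxt["depth"]))
m <- lm(x ~ y, data = pts) # exact fit through two points
pred <- predict(m, newdata = data.frame(y = obs["depth"]))
res <- abs(pred - obs["temp"]) # residual anomaly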
Do thermometers get better or worse over time?
# Transposing
SL_column <- as.data.frame(t(SL[1,]))
# Renaming
colnames(SL_column) <- "SL"
# Sorting
SL_sorted <- SL_column[order(rownames(SL_column)), , drop = FALSE]
# Tally the independent numbers
tally <- table(SL_sorted$SL)
#print(SL_sorted) #as a check
#print(tally) #as a check
tally[2:3] # view observed frequencies (index 1 is the -99 sentinel)
##
## -1 1
## 22 23
# and record them
observed <- c(22, 23)
# expected frequencies under a 50:50 split (22.5 each)
expected <- c(22.5, 22.5)
# get chi-squared statistic
chi_squared <- sum((observed - expected)^2 / expected)
# and calculate dfs
df <- length(observed) - 1
# and p value
p_value <- pchisq(chi_squared, df, lower.tail = FALSE)
# View statistic and the p-value
print(chi_squared)
## [1] 0.02222222
print(p_value)
## [1] 0.8814975
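As a cross-check, R's built-in goodness-of-fit test reproduces the same statistic and p-value from the observed counts above (a sketch):
# equivalent built-in chi-squared goodness-of-fit test (sketch)
chisq.test(observed, p = c(0.5, 0.5))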
Plot residual anomaly by depth
# Initialize list
data_list <- list()
# Loop through each column
for (i in 1:ncol(UTs)) {
# pull out UTs and Depth for the current column
D <- data.frame(UTs = UTs[, i], Depth = DepUTs[, i], Names = names(UTs)[i])
# Remove rows with NA values
D <- na.omit(D)
# Append the data frame to the list
data_list[[i]] <- D
# print(names(UTs)[i]) #check
}
# Combine the data frames in the list by row binding
merged_data <- do.call(rbind, data_list)
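# (sketch) an equivalent one-liner with dplyr, which is already attached:
# merged_data <- bind_rows(data_list)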
#print(merged_data) # a check
# color assignments with viridis color scheme
colors <- viridis(length(unique(merged_data$Names)))
# for saving figure
#pdf("plot_merged_data.pdf")
# Points plot
plot(merged_data$Depth, merged_data$UTs, col = colors[as.numeric(factor(merged_data$Names))], xlab = "Depth", ylab = "Residuals")
# legend
legend("topright", legend = levels(factor(merged_data$Names)), fill = colors, title = "Thermometer", cex = 1, ncol = 4)
#dev.off()
# stepwise model selection in both directions
step_model <- step(lm(UTs ~ Depth, data = merged_data), direction = "both")
## Start: AIC=727.75
## UTs ~ Depth
##
## Df Sum of Sq RSS AIC
## <none> 2318.2 727.75
## - Depth 1 21.524 2339.7 729.72
summary(step_model) # view model
##
## Call:
## lm(formula = UTs ~ Depth, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4675 -0.9000 -0.5828 0.0604 21.1072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5287616 0.1712032 8.930 <2e-16 ***
## Depth -0.0003062 0.0001538 -1.991 0.0471 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.33 on 427 degrees of freedom
## Multiple R-squared: 0.009199, Adjusted R-squared: 0.006879
## F-statistic: 3.965 on 1 and 427 DF, p-value: 0.0471
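The Depth slope and its p-value can also be pulled out of the model programmatically (a sketch):
# extract the Depth coefficient row: Estimate, Std. Error, t value, Pr(>|t|) (sketch)
coef(summary(step_model))["Depth", ]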
View errors & error ratios by thermometer
# Create empty data frame with column names for each thermometer name/number
utall <- data.frame(unique(THTall))
subset_data <- data.frame(UTALone = character(), AllTallies = integer(), ResidTallies = integer())
# loop through thermometers and tally total appearances and error appearances
for (i in 1:81) { # 81 = number of unique thermometer labels in utall
UTALone <- utall[i, 1]
subset_data_i <- merged_data[merged_data[, 3] == UTALone, ]
ResidTallies <- nrow(subset_data_i)
AllTallies <- nrow(THTall[THTall[, 1] == UTALone, ])
subset_data_i <- data.frame(UTALone = UTALone, AllTallies = AllTallies, ResidTallies = ResidTallies)
subset_data <- rbind(subset_data, subset_data_i)
}
# calculate the ratio of error observations to total appearances
subset_data$Ratio <- subset_data$ResidTallies / subset_data$AllTallies
# Plot therm ratio by thermometer
plot(subset_data$Ratio, xlab = "Thermometer index", ylab = "Ratio of thermometers with error", ylim = c(0, 1.1))
#plot(subset_data$AllTallies~subset_data$ResidTallies) # another way to look at error
# Create the plot
plot(subset_data$ResidTallies, subset_data$AllTallies,
xlab = "Error Thermometer Counts", ylab = "All Thermometer Counts",
col = viridis(length(unique(subset_data$UTALone))),
pch = 16, cex = 0.7)
# Add legend with four columns... I ended up putting this in manually
#legend("bottomright", legend = levels(factor(subset_data$UTALone)),
# col = viridis(length(unique(subset_data$UTALone))), pch = 16, cex = 0.35, title = #"UTALone",
# ncol = 4)
# Fit a linear model
fit <- lm(AllTallies ~ ResidTallies, data = subset_data)
# Print summary statistics
summary(fit)
##
## Call:
## lm(formula = AllTallies ~ ResidTallies, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.932 -20.472 -13.732 4.688 200.688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.7318 7.0414 2.092 0.0396 *
## ResidTallies 8.7400 0.7355 11.883 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.85 on 79 degrees of freedom
## Multiple R-squared: 0.6412, Adjusted R-squared: 0.6367
## F-statistic: 141.2 on 1 and 79 DF, p-value: < 2.2e-16
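While the counts scatterplot device is still open, the fitted line can be overlaid for reference (a sketch):
# overlay the fitted regression line on the preceding scatterplot (sketch)
abline(fit, lty = 2)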
K-Means Clustering
# Add a new column with row numbers (needed because thermometer labels contain letters)
subset_data$Row_Number <- row.names(subset_data)
# Print updated dataset
# print(subset_data) # a check
# new vector for clustering on the ratio column only
subset_dataA <- subset_data[,4]
# Perform K-means clustering
kmeans_result <- kmeans(subset_dataA, centers = 4) # centers can be adjusted
# Plot with clusters
plot(subset_data$Ratio, col = kmeans_result$cluster, xlab = "Thermometer index", ylab = "Ratio of thermometers with error", ylim = c(0, 1.1))
legend("topright", legend = unique(kmeans_result$cluster), col = 1:length(unique(kmeans_result$cluster)), pch = 1)
# Append cluster assignments to subset_dataA
subset_data$Cluster <- kmeans_result$cluster
subset_cluster_1 <- subset_data[subset_data$Cluster == 1, ]
#max(subset_cluster_1$Ratio) # is 0.1764706
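Note that kmeans starts from random centers, so cluster labels and assignments can vary between runs; a sketch for a reproducible and more stable fit (the seed value is arbitrary):
# reproducible k-means with multiple random starts (sketch)
set.seed(139) # arbitrary seed
kmeans_result <- kmeans(subset_dataA, centers = 4, nstart = 25) # keep the best of 25 starts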
Elbow method
# loop through candidate cluster counts, capped at 10
wcss <- vector()
for (i in 1:10) {
kmeans_result <- kmeans(subset_dataA, centers = i)
wcss[i] <- sum(kmeans_result$withinss) # calc wcss
}
# Plot Elbow Method
plot(1:10, wcss, type = "b", xlab = "N Clusters", ylab = "WCSS",
main = "Elbow Method")
# first differences of WCSS
diff_wcss <- c(0, diff(wcss))
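The first differences can then be plotted to see where the WCSS drop levels off (a sketch):
# visualize where the WCSS decrease flattens (sketch)
plot(1:10, diff_wcss, type = "b", xlab = "N Clusters", ylab = "Change in WCSS")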
Bar chart of all thermometer tallies
ggplot(subset_data, aes(x=factor(UTALone), y=AllTallies)) +
geom_col(fill="black") +
labs(title="Tally counts for all thermometers", x="Thermometer", y="Tally")
#ggsave("plot.pdf", device = "pdf") # for saving