library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
Warning: package 'randomForest' was built under R version 4.4.2
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine

The following object is masked from 'package:ggplot2':

    margin
library(neuralnet)
Warning: package 'neuralnet' was built under R version 4.4.2

Attaching package: 'neuralnet'

The following object is masked from 'package:dplyr':

    compute
library(caret)
Warning: package 'caret' was built under R version 4.4.2
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
library(survival)

Attaching package: 'survival'

The following object is masked from 'package:caret':

    cluster
library(ggplot2)
library(tidyr)
library(dplyr)

# Read the main shark data set and the smaller subset file (paths are relative
# to the working directory), then inspect the column types of the main table.
sharks <- read.csv("sharks.csv")
sharksub <- read.csv("sharksub.csv")
str(sharks)
'data.frame':   500 obs. of  10 variables:
 $ ID    : chr  "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr  "Female" "Female" "Female" "Male" ...
 $ blotch: num  37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : int  148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num  74.7 73.4 71.8 104.6 67.1 ...
 $ length: num  187 189 284 171 264 ...
 $ air   : num  37.7 35.7 34.8 36.2 33.6 ...
 $ water : num  23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num  64.1 73.7 54.4 86.3 108 ...
 $ depth : num  53.2 49.6 49.4 50.3 49 ...
# Per-column summary statistics for the main data set
summary(sharks)
      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83  
# Column types and a preview of the subset table
str(sharksub)
'data.frame':   50 obs. of  4 variables:
 $ ID     : chr  "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr  "Female" "Female" "Female" "Female" ...
 $ blotch1: num  36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num  37.2 34.4 36.5 36 36.8 ...
# Per-column summary statistics for the subset table
summary(sharksub)
      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18  
# Standardise air and water temperature to z-scores (mean 0, sd 1).
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() flattens it so both columns stay plain numeric
# vectors instead of matrix columns inside the data frame.
sharks_norm <- sharks %>%
  mutate(
    air = as.numeric(scale(air)),
    water = as.numeric(scale(water))
  )

# Pearson and Spearman correlation coefficients between air and water
# temperature, evaluated inside the sharks data frame.
pearson_corr <- with(sharks, cor(air, water, method = "pearson"))
spearman_corr <- with(sharks, cor(air, water, method = "spearman"))
cat("Pearson : ", pearson_corr, "\n")
Pearson :  -0.05524051 
# Report the Spearman correlation coefficient
cat("Spearman : ", spearman_corr, "\n")
Spearman :  -0.05637344 
# Significance test for the Pearson correlation between air and water
# temperature; the printed result includes the t statistic, p-value and
# confidence interval.
pearson_test <- cor.test(sharks$air, sharks$water, method = "pearson")
print(pearson_test)

    Pearson's product-moment correlation

data:  sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051 
# Simple linear regression of water temperature on air temperature,
# followed by the coefficient table and fit statistics.
air_water_lm <- lm(water ~ air, data = sharks)
summary(air_water_lm)

Call:
lm(formula = water ~ air, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.03472 -1.47563  0.09925  1.38700  3.06356 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 25.31781    1.86221  13.596   <2e-16 ***
air         -0.06465    0.05236  -1.235    0.218    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.67 on 498 degrees of freedom
Multiple R-squared:  0.003052,  Adjusted R-squared:  0.00105 
F-statistic: 1.524 on 1 and 498 DF,  p-value: 0.2176
# Pull the coefficient of determination out of the model summary and
# report it rounded to three decimals.
r_squared <- summary(air_water_lm)[["r.squared"]]
print(paste("R^2:", round(r_squared, digits = 3)))
[1] "R^2: 0.003"
# Scatter plot of water temperature against air temperature
ggplot(data = sharks, mapping = aes(x = air, y = water)) +
  geom_point(colour = "blue", alpha = 0.6) +
  labs(
    title = "AIR TEMP vs. WATER TEMP",
    x = "AIR TEMP (°C)",
    y = "WATER TEMP (°C)"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 14)
  )

# Same scatter plot with a fitted linear regression line and its
# confidence band overlaid
ggplot(data = sharks, mapping = aes(x = air, y = water)) +
  geom_point(colour = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  labs(
    title = "AIR TEMP vs. WATER TEMP",
    x = "AIR TEMP (°C)",
    y = "WATER TEMP (°C)"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 14)
  )
`geom_smooth()` using formula = 'y ~ x'

# Scatter plot with a LOESS smoothing curve overlaid.
# Points are jittered horizontally by 0.2 to reduce overplotting (jitter
# height is left at its default); the smoother is fitted by geom_smooth()
# from the plot's data and aesthetics.
# The stray trailing comma after `size = 2` was removed: it passed an empty
# argument to geom_point(), which is only tolerated when the dots are
# collected leniently.
ggplot(sharks, aes(x = air, y = water)) +
  geom_point(position = position_jitter(width = 0.2), size = 2) +
  geom_smooth(method = "loess", color = "green") +
  labs(title = "AIR TEMP vs. WATER TEMP",
       x = "AIR TEMP (°C)",
       y = "WATER TEMP (°C)") +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title = element_text(size = 14))
`geom_smooth()` using formula = 'y ~ x'

# Fix the RNG seed so the split is reproducible, then hold out a test set:
# createDataPartition() selects the training rows (p = 0.8) based on the
# water column, and the remaining rows form the test set.
set.seed(123)
train_index <- createDataPartition(sharks$water, p = 0.8, list = FALSE)
train_data <- sharks[train_index, ]
test_data <- sharks[-train_index, ]

# Train a random forest regression model (500 trees) predicting water
# temperature from air temperature; importance = TRUE requests the
# variable-importance measures reported further below.
rf_model <- randomForest(water ~ air, data = train_data, ntree = 500, importance = TRUE)

# Show the model summary
print(rf_model)

Call:
 randomForest(formula = water ~ air, data = train_data, ntree = 500,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 1

          Mean of squared residuals: 3.600593
                    % Var explained: -30.09
# Predict water temperature for the held-out test rows
rf_predictions <- predict(object = rf_model, newdata = test_data)

# Mean squared error (MSE) of the predictions on the test set
mse_rf <- mean((rf_predictions - test_data$water)^2)
print(paste("随机森林MSE:", round(mse_rf, digits = 3)))
[1] "随机森林MSE: 3.37"
# Variable importance measures of the fitted random forest
importance(rf_model)
     %IncMSE IncNodePurity
air 9.148128      944.7882
# Plot the variable importance measures of the fitted random forest
varImpPlot(rf_model)