R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

###########################################################################
# Library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(purrr)
library(ggplot2)
#install.packages('ggthemes')
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.5.3
###########################################################################
# Cleaning air temperature data

# folder containing files
folder_path <- "request/"

# list all csv files
files <- list.files(
  folder_path,
  pattern = "^2022(0[8-9]|1[0-2])\\.csv$|^2023(0[1-9]|1[0-2])\\.csv$|^202401
  \\.csv$",
  full.names = TRUE
)

# read and combine them
combined_data <- files %>%
  lapply(read_csv) %>%
  bind_rows()
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 28 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 30 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 31 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station, time
## dbl (6): latitude [degrees_north], longitude [degrees_east], elevation [feet...
## lgl (3): temp_9m_max [kelvin], temp_9m_min [kelvin], temp_9m_avg [kelvin]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# remove timezone and convert date column to dates the r can process
data_clean <- combined_data %>%
  mutate(
    time = str_remove(time, " (EDT|EST)$"),  
    time = ymd(time)                         
  )

# Add week column
data_clean_w <- data_clean %>%
  mutate(week = floor_date(time, "week"))

# Create average temperature column grouped by week
air_temp_average_w <- data_clean_w %>%
  group_by(week) %>%
  summarise(avg_temp = mean(`temp_2m_avg [kelvin]`, na.rm = TRUE))

# Anomaly (week)
air_temp_average_w <- air_temp_average_w %>%
  mutate(anomaly = avg_temp - mean(avg_temp, na.rm = TRUE))

#Rename anomaly column and delete average temp column (week)
Air_Temp_Anom_w <- air_temp_average_w %>% 
  rename(AT_Anomaly_w = anomaly)
#########################################################################
# Cleaning settlement data

settlement_data <- read_csv('Height_Readings_Date.csv')
## Rows: 1027 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Time
## dbl (1): EWE-TP-056A - Height
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert date column to dates that R can read
settlement_data_clean <- settlement_data %>%
  mutate(
    Time=mdy(Time)
  )

# Add week column
settlement_data_clean_w <- settlement_data_clean %>%
  mutate(week = floor_date(Time,"week"))

# Add average settlement reading column grouped by week
Settlement_Data_Average_w <- settlement_data_clean_w %>%
  group_by(week) %>%
  summarise(Settlement = mean(`EWE-TP-056A - Height`, na.rm = TRUE))
#########################################################################
# Create one table with all temperature and settlement data (week)
Combined_Table_W <- reduce(list(Air_Temp_Anom_w, 
                              Settlement_Data_Average_w), 
                         full_join, by = "week")

# Identify scale factor in order to plot everything on the same graph (week)
scale_factor_w <- max(Combined_Table_W$AT_Anomaly_w, 
                    na.rm = TRUE) /
                  max(Combined_Table_W$Settlement, 
                    na.rm = TRUE)

# Dual axis Plot
ggplot(Combined_Table_W, aes(x = week)) +
  geom_line(aes(y = AT_Anomaly_w, 
                color = "Temperature"), 
            linewidth = 0.75) +
  geom_line(aes(y = Settlement * scale_factor_w, 
                color = "Settlement"),
            linetype = "dashed", linewidth = 1) +
  scale_y_continuous(name = "Normalized Temperature (K)",
    sec.axis = sec_axis(~ . / scale_factor_w, name = "Settlement (in)")) +
  labs(x = "Date",
      color = "Legend",
      title = "Time Series Plot of Normalized Temperature and Settlement Data"
      ) +
  
  theme_minimal()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The product shows that there is an inverse relationship between temperature
# settlement
############################################################################
model <- lm(Settlement ~ avg_temp, 
            data = Combined_Table_W, 
            na.action = na.exclude)
Combined_Table_W$settlement_detrended <- resid(model)

ggplot(Combined_Table_W, aes(x = week, y = Settlement)) +
  geom_line(aes(y = Settlement, color = "Raw data"),
            linetype = "dashed") +
  geom_line(aes(y = settlement_detrended, color = "Detrended data")) +
  labs(color = "Legend",
       x = "Date",
       title = "Settlement Time Series with Temperature Effect Removed" ) +
  theme_minimal()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The product here shows that we can extract the temperature effect from the
# data and reveal real settlement trends
##############################################################################
# Settlement vs temperature plot
ggplot(Combined_Table_W, aes(x = avg_temp, y = Settlement)) +
  geom_point(aes(color = "Settlement"),
    size = 2) +
  geom_smooth(method = "lm", 
              color = "red", 
              linewidth = 1) +
  labs(x = "Temperature",
       y = "Settlement",
       title = "Temperature vs Setllement Data") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# The product here shows a scatter plot, confirming the relationship between
# temperature and settlement. I tried assigning each dot a separate colour based
# on the month as discussed with Professor Frei but could not find a relationship