setwd(“C:/Users/HomePC/OneDrive/Desktop/R project”) # Install necessary libraries (if not already installed) install.packages(“tidyverse”) # For data manipulation and visualization install.packages(“readxl”) # For reading Excel files # Load the required libraries library(tidyverse) library(readxl)

List all sheets in the Excel file

excel_sheets(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release.xlsx”) # Load a specific sheet from the Excel file shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release.xlsx”, sheet = 1)

Preview the first few rows of the data

head(shipwreck_data)

List all files in the current working directory

list.files() # Load the Excel file using the correct name shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”)

Preview the first few rows of the dataset

head(shipwreck_data)

List all sheet names in the Excel file

excel_sheets(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”) # Load the main dataset from the “GeoDatabase” sheet shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”)

Preview the first few rows of the dataset

head(shipwreck_data) # Load the GeoDatabase sheet into R geo_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”)

Preview the first few rows of the data

head(geo_data) # Clean column names for better usability library(janitor) geo_data <- geo_data %>% clean_names()

Preview cleaned data

head(geo_data)

Clean and standardize column names

geo_data <- geo_data %>% clean_names() # Converts names to lowercase and replaces spaces/special characters with underscores

View the cleaned column names

colnames(geo_data) # Preview the first few rows of the cleaned data head(geo_data) # Remove columns with empty or missing names geo_data <- geo_data %>% select(where(~ !all(is.na(.))))

Preview the cleaned dataset

head(geo_data) # Reload the dataset, skipping the first row if necessary geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 1 # Adjust this number based on where the actual data starts )

Clean column names

library(janitor) geo_data <- geo_data %>% clean_names()

Preview the cleaned column names and data

colnames(geo_data) head(geo_data) # Remove columns with all NA values or irrelevant names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps columns with at least one non-NA value

Preview cleaned data

head(geo_data) # Reload the data, skipping more rows geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 2 # Skip the first two rows )

Clean column names

geo_data <- geo_data %>% clean_names()

Preview the cleaned column names

colnames(geo_data) # Remove columns with all NA values or irrelevant names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps columns with at least one non-NA value

Preview the dataset again

head(geo_data) # Reload the dataset and skip additional rows geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 3 # Adjust to find the row where actual data starts )

Clean column names

geo_data <- geo_data %>% clean_names()

Preview the cleaned data

head(geo_data) # Remove irrelevant or empty columns geo_data <- geo_data %>% select(where(~ !all(is.na(.)) & !grepl(“^\.\.\.”, colnames(geo_data))))

Preview the dataset again

head(geo_data) # Rename columns manually if necessary geo_data <- geo_data %>% rename( name = “name”, # Replace with correct column names latitude = “lat”, # Replace as needed longitude = “long” # Replace as needed ) # Display all column names colnames(geo_data)

Reload the dataset, skipping more rows to reach actual headers

geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 4 # Adjust the number of rows skipped if necessary )

Preview the first few rows

head(geo_data) # Load the data without headers to inspect its structure geo_data_raw <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, col_names = FALSE # Prevent R from using the first row as headers )

Preview the first few rows

head(geo_data_raw) # Reload the dataset, skipping the rows we identified as extra geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 2 # Adjust this based on where the actual data starts )

Clean column names for better usability

library(janitor) geo_data <- geo_data %>% clean_names()

Preview the cleaned dataset

head(geo_data) # Remove empty or irrelevant columns geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps only columns with data

Preview the updated dataset

head(geo_data) # Check the structure of the dataset str(geo_data) # Remove columns with all NA values or empty names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps only columns with at least one non-NA value

Preview the dataset after cleaning

head(geo_data) # Check for missing values in key columns colSums(is.na(geo_data)) # Remove all rows with missing values geo_data_clean <- geo_data %>% drop_na()

Preview the cleaned dataset

head(geo_data_clean)

Check the number of rows after cleaning

nrow(geo_data_clean) # Remove all rows with missing values geo_data_clean <- geo_data %>% drop_na() # Preview the cleaned dataset head(geo_data_clean)

Check the number of rows after cleaning

nrow(geo_data_clean)

Remove all rows with missing values

geo_data_clean <- geo_data %>% drop_na() # Preview the cleaned dataset head(geo_data_clean)

Check the number of rows after cleaning

nrow(geo_data_clean)

Remove rows with missing values in key columns

geo_data_clean <- geo_data %>% filter(!is.na(...7) & !is.na(...8) & !is.na(...13) & !is.na(...14))

Preview the cleaned dataset

head(geo_data_clean)

Check the number of rows after cleaning

nrow(geo_data_clean) # Preview the cleaned dataset head(geo_data_clean)

Check the number of rows after cleaning

nrow(geo_data_clean) # View column names to decide which ones are essential colnames(geo_data) # Filter rows where key columns are not missing geo_data_clean <- geo_data %>% filter(!is.na(…7) & !is.na(…8)) # Replace …7 and …8 with actual column names for latitude and longitude

Preview the cleaned dataset

head(geo_data_clean)

Check the number of rows

nrow(geo_data_clean)

Save the cleaned dataset

write_csv(geo_data_clean, “cleaned_geo_data.csv”) # Plot a histogram of latitude values ggplot(data = geo_data_clean, aes(x = …7)) + # Replace ...7 with the actual column name for latitude geom_histogram(binwidth = 1, fill = “blue”, color = “black”) + labs(title = “Histogram of Latitude Values”, x = “Latitude”, y = “Count”) + theme_minimal() geo_data_clean <- geo_data_clean %>% mutate(latitude = as.numeric(...7)) # Replace ...7 with the correct column name for latitude geo_data_clean %>% filter(!is.na(...7) & is.na(as.numeric(...7))) %>% select(...7) # Replace ...7 with your actual column name for latitude

unique(geo_data_clean$...7)

Replace ...7 with your actual column name

unique(geo_data_clean$...7) geo_data_clean <- geo_data_clean %>% filter(!is.na(as.numeric(...7)))

geo_data_clean %>% filter(is.na(as.numeric(...7))) %>% select(...7) %>% unique() geo_data_clean <- geo_data_clean %>% filter(!is.na(as.numeric(...7))) geo_data_clean <- geo_data_clean %>% mutate(...7 = ifelse(is.na(as.numeric(...7)), NA, as.numeric(...7))) geo_data_clean <- geo_data_clean %>% filter(!is.na(...7)) ggplot(data = geo_data_clean, aes(x = as.numeric(...7))) + geom_histogram(binwidth = 1, fill = “blue”, color = “black”) + labs(title = “Histogram of Latitude Values”, x = “Latitude”, y = “Count”) + theme_minimal() # Ensure the columns are numeric geo_data_clean <- geo_data_clean %>% mutate( Depth = as.numeric(...17), # Replace ...17 with actual column name for depth Start_Date = as.numeric(...13) # Replace ...13 with actual column name for start date )

Remove rows with missing values in key columns

geo_data_clean <- geo_data_clean %>% filter(!is.na(Depth) & !is.na(Start_Date))

Plot scatter plot

ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date”, x = “Start Date”, y = “Depth”) + theme_minimal()

colnames(geo_data_clean) # Inspect column names to identify the correct ones colnames(geo_data_clean)

Replace with correct column names for depth and start date

geo_data_clean <- geo_data_clean %>% mutate( Depth = as.numeric(...17), # Replace ...17 with the actual column name for depth Start_Date = as.numeric(...13) # Replace ...13 with the actual column name for start date )

Remove rows with missing values in key columns

geo_data_clean <- geo_data_clean %>% filter(!is.na(Depth) & !is.na(Start_Date))

Plot scatter plot

ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date”, x = “Start Date”, y = “Depth”) + theme_minimal() unique(geo_data_clean\(`...13`) # Check unique values for column ...13 unique(geo_data_clean\)...17) # Check unique values for column …17 colnames(geo_data_clean) geo_data_clean <- geo_data_clean %>% rename( Start_Date = ...13, # Replace with the correct column index for Start Date Depth = ...17 # Replace with the correct column index for Depth ) colnames(geo_data_clean) geo_data_clean <- geo_data_clean %>% filter(!is.na(Start_Date) & !is.na(Depth)) ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date”, x = “Start Date”, y = “Depth”) + theme_minimal() ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”, y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ Region)

colnames(geo_data_clean) ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”, y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ Region) # Replace Region with the correct column name ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”, y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ ...48) # Replace ...48 with the actual column name colnames(geo_data_clean) geo_data_clean <- geo_data_clean %>% rename(Region = ...48) # Replace ...48 with the correct column name unique(geo_data_clean$Region) ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”, y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ Region) geo_data_clean <- geo_data_clean %>% filter(!is.na(Region)) facet_wrap(~ Region, nrow = 2) # Example for 2 rows theme(axis.text.x = element_text(angle = 45, hjust = 1)) geom_point(aes(color = Region), alpha = 0.6) # Save the cleaned data to a CSV file in the working directory write.csv(geo_data_clean, “cleaned_geo_data.csv”, row.names = FALSE)