setwd(“C:/Users/HomePC/OneDrive/Desktop/R project”) # Install necessary libraries (if not already installed) install.packages(“tidyverse”) # For data manipulation and visualization install.packages(“readxl”) # For reading Excel files # Load the required libraries library(tidyverse) library(readxl)
excel_sheets(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release.xlsx”) # Load a specific sheet from the Excel file shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release.xlsx”, sheet = 1)
head(shipwreck_data)
list.files() # Load the Excel file using the correct name shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”)
head(shipwreck_data)
excel_sheets(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”) # Load the main dataset from the “GeoDatabase” sheet shipwreck_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”)
head(shipwreck_data) # Load the GeoDatabase sheet into R geo_data <- read_excel(“Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”)
head(geo_data) # Clean column names for better usability library(janitor) geo_data <- geo_data %>% clean_names()
head(geo_data)
geo_data <- geo_data %>% clean_names() # Converts names to lowercase and replaces spaces/special characters with underscores
colnames(geo_data) # Preview the first few rows of the cleaned data head(geo_data) # Remove columns with empty or missing names geo_data <- geo_data %>% select(where(~ !all(is.na(.))))
head(geo_data) # Reload the dataset, skipping the first row if necessary geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 1 # Adjust this number based on where the actual data starts )
library(janitor) geo_data <- geo_data %>% clean_names()
colnames(geo_data) head(geo_data) # Remove columns with all NA values or irrelevant names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps columns with at least one non-NA value
head(geo_data) # Reload the data, skipping more rows geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 2 # Skip the first two rows )
geo_data <- geo_data %>% clean_names()
colnames(geo_data) # Remove columns with all NA values or irrelevant names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps columns with at least one non-NA value
head(geo_data) # Reload the dataset and skip additional rows geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 3 # Adjust to find the row where actual data starts )
geo_data <- geo_data %>% clean_names()
head(geo_data) # Remove irrelevant or empty columns geo_data <- geo_data %>% select(where(~ !all(is.na(.)) & !grepl(“^\.\.\.”, colnames(geo_data))))
head(geo_data) # Rename columns manually if necessary geo_data <- geo_data %>% rename( name = “name”, # Replace with correct column names latitude = “lat”, # Replace as needed longitude = “long” # Replace as needed ) # Display all column names colnames(geo_data)
geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 4 # Adjust the number of rows skipped if necessary )
head(geo_data) # Load the data without headers to inspect its structure geo_data_raw <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, col_names = FALSE # Prevent R from using the first row as headers )
head(geo_data_raw) # Reload the dataset, skipping the rows we identified as extra geo_data <- read_excel( “Geodatabase of Shipwrecks OXRep-DARMC-MAPS 07-2020-for-release (1).xlsx”, sheet = “GeoDatabase”, skip = 2 # Adjust this based on where the actual data starts )
library(janitor) geo_data <- geo_data %>% clean_names()
head(geo_data) # Remove empty or irrelevant columns geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps only columns with data
head(geo_data) # Check the structure of the dataset str(geo_data) # Remove columns with all NA values or empty names geo_data <- geo_data %>% select(where(~ !all(is.na(.)))) # Keeps only columns with at least one non-NA value
head(geo_data) # Check for missing values in key columns colSums(is.na(geo_data)) # Remove all rows with missing values geo_data_clean <- geo_data %>% drop_na()
head(geo_data_clean)
nrow(geo_data_clean) # Remove all rows with missing values geo_data_clean <- geo_data %>% drop_na() # Preview the cleaned dataset head(geo_data_clean)
nrow(geo_data_clean)
geo_data_clean <- geo_data %>% drop_na() # Preview the cleaned dataset head(geo_data_clean)
nrow(geo_data_clean)
geo_data_clean <- geo_data %>% filter(!is.na(...7)
& !is.na(...8) & !is.na(...13) &
!is.na(...14))
head(geo_data_clean)
nrow(geo_data_clean) # Preview the cleaned dataset head(geo_data_clean)
nrow(geo_data_clean) # View column names to decide which ones are essential colnames(geo_data) # Filter rows where key columns are not missing geo_data_clean <- geo_data %>% filter(!is.na(…7) & !is.na(…8)) # Replace …7 and …8 with actual column names for latitude and longitude
head(geo_data_clean)
nrow(geo_data_clean)
write_csv(geo_data_clean, “cleaned_geo_data.csv”) # Plot a histogram
of latitude values ggplot(data = geo_data_clean, aes(x = …7)) + #
Replace ...7 with the actual column name for latitude
geom_histogram(binwidth = 1, fill = “blue”, color = “black”) +
labs(title = “Histogram of Latitude Values”, x = “Latitude”, y =
“Count”) + theme_minimal() geo_data_clean <- geo_data_clean %>%
mutate(latitude = as.numeric(...7)) # Replace
...7 with the correct column name for latitude
geo_data_clean %>% filter(!is.na(...7) &
is.na(as.numeric(...7))) %>% select(...7) #
Replace ...7 with your actual column name for latitude
unique(geo_data_clean$...7)
...7 with your actual column nameunique(geo_data_clean$...7) geo_data_clean <-
geo_data_clean %>% filter(!is.na(as.numeric(...7)))
geo_data_clean %>% filter(is.na(as.numeric(...7)))
%>% select(...7) %>% unique() geo_data_clean <-
geo_data_clean %>% filter(!is.na(as.numeric(...7)))
geo_data_clean <- geo_data_clean %>% mutate(...7 =
ifelse(is.na(as.numeric(...7)), NA,
as.numeric(...7))) geo_data_clean <- geo_data_clean
%>% filter(!is.na(...7)) ggplot(data = geo_data_clean,
aes(x = as.numeric(...7))) + geom_histogram(binwidth = 1,
fill = “blue”, color = “black”) + labs(title = “Histogram of Latitude
Values”, x = “Latitude”, y = “Count”) + theme_minimal() # Ensure the
columns are numeric geo_data_clean <- geo_data_clean %>% mutate(
Depth = as.numeric(...17), # Replace ...17
with actual column name for depth Start_Date =
as.numeric(...13) # Replace ...13 with actual
column name for start date )
geo_data_clean <- geo_data_clean %>% filter(!is.na(Depth) & !is.na(Start_Date))
ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date”, x = “Start Date”, y = “Depth”) + theme_minimal()
colnames(geo_data_clean) # Inspect column names to identify the correct ones colnames(geo_data_clean)
geo_data_clean <- geo_data_clean %>% mutate( Depth =
as.numeric(...17), # Replace ...17 with the
actual column name for depth Start_Date = as.numeric(...13)
# Replace ...13 with the actual column name for start date
)
geo_data_clean <- geo_data_clean %>% filter(!is.na(Depth) & !is.na(Start_Date))
ggplot(data = geo_data_clean, aes(x = Start_Date, y = Depth)) +
geom_point(color = “blue”, alpha = 0.6) + labs(title = “Depth vs. Start
Date”, x = “Start Date”, y = “Depth”) + theme_minimal()
unique(geo_data_clean\(`...13`) # Check
unique values for column ...13
unique(geo_data_clean\)...17) # Check unique values
for column …17 colnames(geo_data_clean) geo_data_clean <-
geo_data_clean %>% rename( Start_Date = ...13, # Replace
with the correct column index for Start Date Depth = ...17
# Replace with the correct column index for Depth )
colnames(geo_data_clean) geo_data_clean <- geo_data_clean %>%
filter(!is.na(Start_Date) & !is.na(Depth)) ggplot(data =
geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color =
“blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date”, x = “Start
Date”, y = “Depth”) + theme_minimal() ggplot(data = geo_data_clean,
aes(x = Start_Date, y = Depth)) + geom_point(color = “blue”, alpha =
0.6) + labs(title = “Depth vs. Start Date by Region”, x = “Start Date
(Years)”, y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~
Region)
colnames(geo_data_clean) ggplot(data = geo_data_clean, aes(x =
Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) +
labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”,
y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ Region) # Replace
Region with the correct column name ggplot(data =
geo_data_clean, aes(x = Start_Date, y = Depth)) + geom_point(color =
“blue”, alpha = 0.6) + labs(title = “Depth vs. Start Date by Region”, x
= “Start Date (Years)”, y = “Depth (Meters)”) + theme_minimal() +
facet_wrap(~ ...48) # Replace ...48 with the
actual column name colnames(geo_data_clean) geo_data_clean <-
geo_data_clean %>% rename(Region = ...48) # Replace
...48 with the correct column name
unique(geo_data_clean$Region) ggplot(data = geo_data_clean, aes(x =
Start_Date, y = Depth)) + geom_point(color = “blue”, alpha = 0.6) +
labs(title = “Depth vs. Start Date by Region”, x = “Start Date (Years)”,
y = “Depth (Meters)”) + theme_minimal() + facet_wrap(~ Region)
geo_data_clean <- geo_data_clean %>% filter(!is.na(Region))
facet_wrap(~ Region, nrow = 2) # Example for 2 rows theme(axis.text.x =
element_text(angle = 45, hjust = 1)) geom_point(aes(color = Region),
alpha = 0.6) # Save the cleaned data to a CSV file in the working
directory write.csv(geo_data_clean, “cleaned_geo_data.csv”, row.names =
FALSE)