# Import required libraries
library(lubridate) # Parse dates
library(janitor) # Clean data
library(weathermetrics) # Manage weather metrics
library(plyr) # MUST be loaded before dpylr
library(dplyr) # Wrangle data
library(readr) # CSV reader
# Import raw CSV files
df_gsod <- read_delim('/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/uncleaned_datasets/gsod_preprocessed.csv', delim=",")
Review the heads of the file to ensure imported data types appear correct
# Review df head
head(df_gsod)
Results
No issues noted.
All formats appear reasonable.
# Check for duplicate weather stations, year and month in GSOD
get_dupes(df_gsod, stn_id, month, year)
No duplicate combinations found of: stn_id, month, year
Results
No duplicates found
# Remove rows and columns with totally empty values
remove_empty(df_gsod, 'rows')
remove_empty(df_gsod, 'cols')
Results
No empty rows or columns found. Therefore no changes to the df
# Check for missing values / nulls on key
anyNA(df_gsod)
[1] FALSE
Results
anyNA returned FALSE, therefore no missing values requiring further investigation.
# Convert temp, min and max values to celcius
df_gsod$temp <- round(fahrenheit.to.celsius(df_gsod$temp),1)
df_gsod$min <- round(fahrenheit.to.celsius(df_gsod$min),1)
df_gsod$max <- round(fahrenheit.to.celsius(df_gsod$max),1)
Hottest temperature recorded on earth is 56.7C
Lowest temperature recorded is -89.2C
# Filter the data to those with a min or max outside of the global records
min_max_outliers <- filter(df_gsod, max > 57 | min < -90)
min_max_outliers
Results
5721 rows with outlier values in min or max column identified.
Appears that the min and max temperatures are not reliable.
Need to check if the key target variable, temp, has the same issue
# Checking average temp value within expected min and max range
temp_outliers <- nrow(filter(df_gsod, temp > 57 | temp < -90))
0 rows with outlier values in temp column identified.
Temp appears to be reliable at this stage.
Investigation as to what is causing the min and max temps to be incorrect required.
# Import the daily data for 1960 to investigate min and max outlier values
df_1960_raw <- read.csv('/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/original_datasets/Temperature/GSOD/daily_summaries/1960.csv')
# Select std_id 111460 from 1960 in Jan which in the DF has a recorded min of 189.2 and max of 193.08
df_111460 <- df_1960_raw %>%
select(stn_id, yearmoda, temp, min, max) %>%
filter(stn_id == 111460)
# Parse dates and set month
dates <- parse_date_time(df_111460$yearmoda, 'Ymd')
df_111460$month <- month(dates)
# Filter to January and review the results
filter(df_111460, month == 1)
Results
Appears that where a value has not been recorded for min or max, 9999.9 is recorded in its place, resulting in the average min and the max for the month being distorted.
Conclude to drop min and max for now. If needed later in analysis, return to pre-processing steps and clean up and walk through process again This does not appear to be an issue for temp, as when a recording for a day is not made, then there is not data recorded at all. This is confirmed below
# Drop min and max from the dataframe
df_gsod <- select(df_gsod, -min, -max)
# Check if any temps (average temp) is greater than 100C (200F), or lower than 89C (192F)
temp_outliers_1960 <- nrow(filter(df_1960_raw, temp > 200 | temp < -192))
temp_outliers_1960
[1] 0
Results
0 errors identifed with temp in the raw 1960’s data.
Conclude that temp appears reliable.
Write the final cleaned dataset out to CSV for import into MySQL
write_csv(df_gsod, '/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/cleaned_datasets/gsod_cleaned.csv')