# Import required libraries
library(lubridate) # Parse dates
library(janitor) # Clean data
library(weathermetrics) # Manage weather metrics
library(plyr) # MUST be loaded before dpylr
library(dplyr) # Wrangle data
library(readr) # CSV reader

Import CSV file

# Import raw CSV files
df_gsod <- read_delim('/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/uncleaned_datasets/gsod_preprocessed.csv', delim=",")

General Data Cleaning

Check data types and data formatting

Review the heads of the file to ensure imported data types appear correct

# Review df head
head(df_gsod)

Results
No issues noted.
All formats appear reasonable.

Check for duplicate values
# Check for duplicate weather stations, year and month in GSOD
get_dupes(df_gsod, stn_id, month, year)
No duplicate combinations found of: stn_id, month, year

Results
No duplicates found

Remove any empty rows and columns
# Remove rows and columns with totally empty values 
remove_empty(df_gsod, 'rows')
remove_empty(df_gsod, 'cols')

Results
No empty rows or columns found. Therefore no changes to the df

Check for missing values
# Check for missing values / nulls on key 
anyNA(df_gsod)
[1] FALSE

Results
anyNA returned FALSE, therefore no missing values requiring further investigation.

Convert temperatures from fahrenheit to Celcius
# Convert temp, min and max values to celcius
df_gsod$temp <- round(fahrenheit.to.celsius(df_gsod$temp),1)
df_gsod$min <- round(fahrenheit.to.celsius(df_gsod$min),1)
df_gsod$max <- round(fahrenheit.to.celsius(df_gsod$max),1)
Check for outlying values beyond known global max and min temps

Hottest temperature recorded on earth is 56.7C
Lowest temperature recorded is -89.2C

# Filter the data to those with a min or max outside of the global records
min_max_outliers <- filter(df_gsod, max > 57 | min < -90)
min_max_outliers

Results
5721 rows with outlier values in min or max column identified.
Appears that the min and max temperatures are not reliable.
Need to check if the key target variable, temp, has the same issue

# Checking average temp value within expected min and max range
temp_outliers <- nrow(filter(df_gsod, temp > 57 | temp < -90))

0 rows with outlier values in temp column identified.
Temp appears to be reliable at this stage.
Investigation as to what is causing the min and max temps to be incorrect required.

# Import the daily data for 1960 to investigate min and max outlier values
df_1960_raw <- read.csv('/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/original_datasets/Temperature/GSOD/daily_summaries/1960.csv')
# Select std_id 111460 from 1960 in Jan which in the DF has a recorded min of 189.2 and max of 193.08
df_111460 <- df_1960_raw %>% 
  select(stn_id, yearmoda, temp, min, max) %>%
  filter(stn_id == 111460)
# Parse dates and set month
dates <- parse_date_time(df_111460$yearmoda, 'Ymd')
df_111460$month <- month(dates)
# Filter to January and review the results
filter(df_111460, month == 1)

Results
Appears that where a value has not been recorded for min or max, 9999.9 is recorded in its place, resulting in the average min and the max for the month being distorted.
Conclude to drop min and max for now. If needed later in analysis, return to pre-processing steps and clean up and walk through process again This does not appear to be an issue for temp, as when a recording for a day is not made, then there is not data recorded at all. This is confirmed below

# Drop min and max from the dataframe
df_gsod <- select(df_gsod, -min, -max)
# Check if any temps (average temp) is greater than 100C (200F), or lower than 89C (192F)
temp_outliers_1960 <- nrow(filter(df_1960_raw, temp > 200 | temp < -192))
temp_outliers_1960
[1] 0

Results
0 errors identifed with temp in the raw 1960’s data.
Conclude that temp appears reliable.

Write dataset out to CSV

Write the final cleaned dataset out to CSV for import into MySQL

write_csv(df_gsod, '/Users/todddequincey/Google Drive/Education & Learning/Data Science/USQ/Semester 2 2018/CSC8002 Big Data Management/Project/Datasets/cleaned_datasets/gsod_cleaned.csv')
LS0tCnRpdGxlOiAiR1NPRCBEYXRhIENsZWFuaW5nIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgoKYGBge3J9CiMgSW1wb3J0IHJlcXVpcmVkIGxpYnJhcmllcwpsaWJyYXJ5KGx1YnJpZGF0ZSkgIyBQYXJzZSBkYXRlcwpsaWJyYXJ5KGphbml0b3IpICMgQ2xlYW4gZGF0YQpsaWJyYXJ5KHdlYXRoZXJtZXRyaWNzKSAjIE1hbmFnZSB3ZWF0aGVyIG1ldHJpY3MKbGlicmFyeShwbHlyKSAjIE1VU1QgYmUgbG9hZGVkIGJlZm9yZSBkcHlscgpsaWJyYXJ5KGRwbHlyKSAjIFdyYW5nbGUgZGF0YQpsaWJyYXJ5KHJlYWRyKSAjIENTViByZWFkZXIKYGBgCgojIyBJbXBvcnQgQ1NWIGZpbGUKYGBge3J9CiMgSW1wb3J0IHJhdyBDU1YgZmlsZXMKZGZfZ3NvZCA8LSByZWFkX2RlbGltKCcvVXNlcnMvdG9kZGRlcXVpbmNleS9Hb29nbGUgRHJpdmUvRWR1Y2F0aW9uICYgTGVhcm5pbmcvRGF0YSBTY2llbmNlL1VTUS9TZW1lc3RlciAyIDIwMTgvQ1NDODAwMiBCaWcgRGF0YSBNYW5hZ2VtZW50L1Byb2plY3QvRGF0YXNldHMvdW5jbGVhbmVkX2RhdGFzZXRzL2dzb2RfcHJlcHJvY2Vzc2VkLmNzdicsIGRlbGltPSIsIikKYGBgCgojIyBHZW5lcmFsIERhdGEgQ2xlYW5pbmcgIAojIyMjIyBDaGVjayBkYXRhIHR5cGVzIGFuZCBkYXRhIGZvcm1hdHRpbmcKUmV2aWV3IHRoZSBoZWFkcyBvZiB0aGUgZmlsZSB0byBlbnN1cmUgaW1wb3J0ZWQgZGF0YSB0eXBlcyBhcHBlYXIgY29ycmVjdApgYGB7cn0KIyBSZXZpZXcgZGYgaGVhZApoZWFkKGRmX2dzb2QpCmBgYAoqKlJlc3VsdHMqKiAgCk5vIGlzc3VlcyBub3RlZC4gIApBbGwgZm9ybWF0cyBhcHBlYXIgcmVhc29uYWJsZS4gIAoKIyMjIyMgQ2hlY2sgZm9yIGR1cGxpY2F0ZSB2YWx1ZXMKYGBge3J9CiMgQ2hlY2sgZm9yIGR1cGxpY2F0ZSB3ZWF0aGVyIHN0YXRpb25zLCB5ZWFyIGFuZCBtb250aCBpbiBHU09ECmdldF9kdXBlcyhkZl9nc29kLCBzdG5faWQsIG1vbnRoLCB5ZWFyKQpgYGAKKipSZXN1bHRzKiogIApObyBkdXBsaWNhdGVzIGZvdW5kCgoKIyMjIyMgUmVtb3ZlIGFueSBlbXB0eSByb3dzIGFuZCBjb2x1bW5zCmBgYHtyfQojIFJlbW92ZSByb3dzIGFuZCBjb2x1bW5zIHdpdGggdG90YWxseSBlbXB0eSB2YWx1ZXMgCnJlbW92ZV9lbXB0eShkZl9nc29kLCAncm93cycpCnJlbW92ZV9lbXB0eShkZl9nc29kLCAnY29scycpCmBgYAoqKlJlc3VsdHMqKiAgCk5vIGVtcHR5IHJvd3Mgb3IgY29sdW1ucyBmb3VuZC4gVGhlcmVmb3JlIG5vIGNoYW5nZXMgdG8gdGhlIGRmCgoKIyMjIyMgQ2hlY2sgZm9yIG1pc3NpbmcgdmFsdWVzCmBgYHtyfQojIENoZWNrIGZvciBtaXNzaW5nIHZhbHVlcyAvIG51bGxzIG9uIGtleSAKYW55TkEoZGZfZ3NvZCkKYGBgCioqUmVzdWx0cyoqICAKYW55TkEgcmV0dXJuZWQgRkFMU0UsIHRoZXJlZm9yZSBubyBtaXNzaW5nIHZhbHVlcyByZXF1aXJpbmcgZnVydGhlciBpbnZlc3RpZ2F0aW9uLgoKCiMjIyMjIENvbnZlcnQgdGVtcGVyYXR1cmVzIGZyb20gZmFocmVuaGVpdCB0byBDZWxjaXVzCmBgYHtyfQojIENvbnZlcnQgdGVtcCwgbWluIGFuZCBtYXggdmFsdWVzIHRvIGNlbGNpdXMKZGZfZ3NvZCR0ZW1wIDwtIHJvdW5kKGZhaHJlbmhlaXQudG8uY2Vsc2l1cyhkZl9nc29kJHRlbXApLDEpCmRmX2dzb2QkbWluIDwtIHJvdW5kKGZhaHJlbmhlaXQudG8uY2Vsc2l1cyhkZl9nc29kJG1pbiksMSkKZGZfZ3NvZCRtYXggPC0gcm91bmQoZmFocmVuaGVpdC50by5jZWxzaXVzKGRmX2dzb2QkbWF4KSwxKQpgYGAKCiMjIyMjIENoZWNrIGZvciBvdXRseWluZyB2YWx1ZXMgYmV5b25kIGtub3duIGdsb2JhbCBtYXggYW5kIG1pbiB0ZW1wcwpbSG90dGVzdCB0ZW1wZXJhdHVyZV0oaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvSGlnaGVzdF90ZW1wZXJhdHVyZV9yZWNvcmRlZF9vbl9FYXJ0aCkgcmVjb3JkZWQgb24gZWFydGggaXMgNTYuN0MgIApbTG93ZXN0IHRlbXBlcmF0dXJlXShodHRwczovL2VuLndpa2lwZWRpYS5vcmcvd2lraS9Mb3dlc3RfdGVtcGVyYXR1cmVfcmVjb3JkZWRfb25fRWFydGgpIHJlY29yZGVkIGlzIC04OS4yQwpgYGB7cn0KIyBGaWx0ZXIgdGhlIGRhdGEgdG8gdGhvc2Ugd2l0aCBhIG1pbiBvciBtYXggb3V0c2lkZSBvZiB0aGUgZ2xvYmFsIHJlY29yZHMKbWluX21heF9vdXRsaWVycyA8LSBmaWx0ZXIoZGZfZ3NvZCwgbWF4ID4gNTcgfCBtaW4gPCAtOTApCm1pbl9tYXhfb3V0bGllcnMKYGBgCgoqKlJlc3VsdHMqKiAgCmByIG5yb3cobWluX21heF9vdXRsaWVycylgIHJvd3Mgd2l0aCBvdXRsaWVyIHZhbHVlcyBpbiBtaW4gb3IgbWF4IGNvbHVtbiBpZGVudGlmaWVkLiAgCkFwcGVhcnMgdGhhdCB0aGUgbWluIGFuZCBtYXggdGVtcGVyYXR1cmVzIGFyZSBub3QgcmVsaWFibGUuICAKTmVlZCB0byBjaGVjayBpZiB0aGUga2V5IHRhcmdldCB2YXJpYWJsZSwgdGVtcCwgaGFzIHRoZSBzYW1lIGlzc3VlCgoKYGBge3J9CiMgQ2hlY2tpbmcgYXZlcmFnZSB0ZW1wIHZhbHVlIHdpdGhpbiBleHBlY3RlZCBtaW4gYW5kIG1heCByYW5nZQp0ZW1wX291dGxpZXJzIDwtIG5yb3coZmlsdGVyKGRmX2dzb2QsIHRlbXAgPiA1NyB8IHRlbXAgPCAtOTApKQpgYGAKYHIgYXMuaW50ZWdlcih0ZW1wX291dGxpZXJzKWAgcm93cyB3aXRoIG91dGxpZXIgdmFsdWVzIGluIHRlbXAgY29sdW1uIGlkZW50aWZpZWQuICAKVGVtcCBhcHBlYXJzIHRvIGJlIHJlbGlhYmxlIGF0IHRoaXMgc3RhZ2UuICAKSW52ZXN0aWdhdGlvbiBhcyB0byB3aGF0IGlzIGNhdXNpbmcgdGhlIG1pbiBhbmQgbWF4IHRlbXBzIHRvIGJlIGluY29ycmVjdCByZXF1aXJlZC4KCgpgYGB7cn0KIyBJbXBvcnQgdGhlIGRhaWx5IGRhdGEgZm9yIDE5NjAgdG8gaW52ZXN0aWdhdGUgbWluIGFuZCBtYXggb3V0bGllciB2YWx1ZXMKZGZfMTk2MF9yYXcgPC0gcmVhZC5jc3YoJy9Vc2Vycy90b2RkZGVxdWluY2V5L0dvb2dsZSBEcml2ZS9FZHVjYXRpb24gJiBMZWFybmluZy9EYXRhIFNjaWVuY2UvVVNRL1NlbWVzdGVyIDIgMjAxOC9DU0M4MDAyIEJpZyBEYXRhIE1hbmFnZW1lbnQvUHJvamVjdC9EYXRhc2V0cy9vcmlnaW5hbF9kYXRhc2V0cy9UZW1wZXJhdHVyZS9HU09EL2RhaWx5X3N1bW1hcmllcy8xOTYwLmNzdicpCmBgYAoKCmBgYHtyfQojIFNlbGVjdCBzdGRfaWQgMTExNDYwIGZyb20gMTk2MCBpbiBKYW4gd2hpY2ggaW4gdGhlIERGIGhhcyBhIHJlY29yZGVkIG1pbiBvZiAxODkuMiBhbmQgbWF4IG9mIDE5My4wOApkZl8xMTE0NjAgPC0gZGZfMTk2MF9yYXcgJT4lIAogIHNlbGVjdChzdG5faWQsIHllYXJtb2RhLCB0ZW1wLCBtaW4sIG1heCkgJT4lCiAgZmlsdGVyKHN0bl9pZCA9PSAxMTE0NjApCgojIFBhcnNlIGRhdGVzIGFuZCBzZXQgbW9udGgKZGF0ZXMgPC0gcGFyc2VfZGF0ZV90aW1lKGRmXzExMTQ2MCR5ZWFybW9kYSwgJ1ltZCcpCmRmXzExMTQ2MCRtb250aCA8LSBtb250aChkYXRlcykKCiMgRmlsdGVyIHRvIEphbnVhcnkgYW5kIHJldmlldyB0aGUgcmVzdWx0cwpmaWx0ZXIoZGZfMTExNDYwLCBtb250aCA9PSAxKQpgYGAKCioqUmVzdWx0cyoqICAKQXBwZWFycyB0aGF0IHdoZXJlIGEgdmFsdWUgaGFzIG5vdCBiZWVuIHJlY29yZGVkIGZvciBtaW4gb3IgbWF4LCA5OTk5LjkgaXMgcmVjb3JkZWQgaW4gaXRzIHBsYWNlLCByZXN1bHRpbmcgaW4gdGhlIGF2ZXJhZ2UgbWluIGFuZCB0aGUgbWF4IGZvciB0aGUgbW9udGggYmVpbmcgZGlzdG9ydGVkLiAgCkNvbmNsdWRlIHRvIGRyb3AgbWluIGFuZCBtYXggZm9yIG5vdy4gSWYgbmVlZGVkIGxhdGVyIGluIGFuYWx5c2lzLCByZXR1cm4gdG8gcHJlLXByb2Nlc3Npbmcgc3RlcHMgYW5kIGNsZWFuIHVwIGFuZCB3YWxrIHRocm91Z2ggcHJvY2VzcyBhZ2FpbgpUaGlzIGRvZXMgbm90IGFwcGVhciB0byBiZSBhbiBpc3N1ZSBmb3IgdGVtcCwgYXMgd2hlbiBhIHJlY29yZGluZyBmb3IgYSBkYXkgaXMgbm90IG1hZGUsIHRoZW4gdGhlcmUgaXMgbm90IGRhdGEgcmVjb3JkZWQgYXQgYWxsLiBUaGlzIGlzIGNvbmZpcm1lZCBiZWxvdwoKYGBge3J9CiMgRHJvcCBtaW4gYW5kIG1heCBmcm9tIHRoZSBkYXRhZnJhbWUKZGZfZ3NvZCA8LSBzZWxlY3QoZGZfZ3NvZCwgLW1pbiwgLW1heCkKYGBgCgpgYGB7cn0KIyBDaGVjayBpZiBhbnkgdGVtcHMgKGF2ZXJhZ2UgdGVtcCkgaXMgZ3JlYXRlciB0aGFuIDEwMEMgKDIwMEYpLCBvciBsb3dlciB0aGFuIDg5QyAoMTkyRikKdGVtcF9vdXRsaWVyc18xOTYwIDwtIG5yb3coZmlsdGVyKGRmXzE5NjBfcmF3LCB0ZW1wID4gMjAwIHwgdGVtcCA8IC0xOTIpKQp0ZW1wX291dGxpZXJzXzE5NjAKYGBgCioqUmVzdWx0cyoqICAKYHIgYXMuaW50ZWdlcih0ZW1wX291dGxpZXJzXzE5NjApYCBlcnJvcnMgaWRlbnRpZmVkIHdpdGggdGVtcCBpbiB0aGUgcmF3IDE5NjAncyBkYXRhLiAgCkNvbmNsdWRlIHRoYXQgdGVtcCBhcHBlYXJzIHJlbGlhYmxlLgoKCiMjIyBXcml0ZSBkYXRhc2V0IG91dCB0byBDU1YKV3JpdGUgdGhlIGZpbmFsIGNsZWFuZWQgZGF0YXNldCBvdXQgdG8gQ1NWIGZvciBpbXBvcnQgaW50byBNeVNRTApgYGB7cn0Kd3JpdGVfY3N2KGRmX2dzb2QsICcvVXNlcnMvdG9kZGRlcXVpbmNleS9Hb29nbGUgRHJpdmUvRWR1Y2F0aW9uICYgTGVhcm5pbmcvRGF0YSBTY2llbmNlL1VTUS9TZW1lc3RlciAyIDIwMTgvQ1NDODAwMiBCaWcgRGF0YSBNYW5hZ2VtZW50L1Byb2plY3QvRGF0YXNldHMvY2xlYW5lZF9kYXRhc2V0cy9nc29kX2NsZWFuZWQuY3N2JykKYGBgCg==