Required packages

library(dplyr)
library(tidyr)
library(readr)
library(lubridate)
library(outliers)
library(ggplot2)
library(forecast)
Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Registered S3 methods overwritten by 'forecast':
  method             from    
  fitted.fracdiff    fracdiff
  residuals.fracdiff fracdiff
This is forecast 8.9 
  Want to stay up-to-date? Read the Hyndsight blog:
  https://robjhyndman.com/hyndsight/

Executive Summary

Data

# Set the working directory, then read the rainfall CSV; the relative path
# below is resolved against that directory.
# NOTE(review): setwd() with an absolute, machine-specific path makes this
# script non-portable — consider a project-relative path instead.
setwd("F:/DataPre/Assignment3")
data1 <- read_csv("Data/Data_rainfall_melbourne.csv")
Parsed with column specification:
cols(
  `Bureau of Meteorology station number` = col_double(),
  Year = col_double(),
  Month = col_double(),
  `Rainfall Amount` = col_double(),
  Quality = col_character(),
  `Report Generation Date` = col_character(),
  Outcome = col_character()
)
# Read the temperature CSV (path is relative to the current working directory).
data2 <- read_csv("Data/Data_temperature_melbourne.csv")
Parsed with column specification:
cols(
  `Bureau of Meteorology station number` = col_double(),
  Year = col_double(),
  Month = col_double(),
  `Mean maximum temperature` = col_double(),
  Quality = col_character(),
  `Report Generation Date` = col_character(),
  Outcome = col_character()
)

Dataset 1

# Drop the station-number column from the rainfall table, then preview it.
data1 <- select(data1, -`Bureau of Meteorology station number`)
data1 %>% head()

Dataset 2

# Drop the station-number column from the temperature table, then preview it.
data2 <- select(data2, -`Bureau of Meteorology station number`)
data2 %>% head()

Joining Datasets

# Attach the temperature columns to every rainfall row, matching on year,
# month and report date. Non-key columns present in both tables (Quality,
# Outcome) receive the suffix .x (rainfall) or .y (temperature).
data <- data1 %>%
  left_join(data2, by = c("Year", "Month", "Report Generation Date"))
data %>% head()

Understand

# Inspect column types of the joined table before any conversion.
str(data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':    57 obs. of  9 variables:
 $ Year                    : num  2015 2015 2015 2015 2015 ...
 $ Month                   : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Rainfall Amount         : num  1.832 1.478 0.967 1.038 1.38 ...
 $ Quality.x               : chr  "N" "Y" "Y" "Y" ...
 $ Report Generation Date  : chr  "01/02/2015" "01/03/2015" "01/04/2015" "01/05/2015" ...
 $ Outcome.x               : chr  "High" "High" "Low" "Low" ...
 $ Mean maximum temperature: num  25.9 26.4 22.7 19.2 17.1 NA 13.3 13.8 17.3 NA ...
 $ Quality.y               : chr  "Y" "Y" "Y" "N" ...
 $ Outcome.y               : chr  "High" "High" "High" "Low" ...
# Convert the character columns to proper types:
#   Outcome.*  -> ordered factor with Low < High
#   Quality.*  -> unordered factor with levels N, Y
#   Report Generation Date -> Date, parsed as day/month/year
data$Outcome.x <- ordered(data$Outcome.x, levels = c("Low", "High"))
data$Outcome.y <- ordered(data$Outcome.y, levels = c("Low", "High"))
data$Quality.x <- factor(data$Quality.x, levels = c("N", "Y"))
data$Quality.y <- factor(data$Quality.y, levels = c("N", "Y"))
data$`Report Generation Date` <- as.Date(
  data$`Report Generation Date`,
  format = "%d/%m/%Y"
)
# Five-number summary (plus mean and NA count) of rainfall.
summary(data$`Rainfall Amount`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0410  0.9605  1.5460  1.6048  2.0945  4.7560       6 
# Five-number summary (plus mean and NA count) of mean maximum temperature.
summary(data$`Mean maximum temperature`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  13.30   15.80   20.75   20.40   25.20   27.50       5 
# Re-inspect the structure to confirm the factor and Date conversions.
str(data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':    57 obs. of  9 variables:
 $ Year                    : num  2015 2015 2015 2015 2015 ...
 $ Month                   : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Rainfall Amount         : num  1.832 1.478 0.967 1.038 1.38 ...
 $ Quality.x               : Factor w/ 2 levels "N","Y": 1 2 2 2 2 2 1 1 2 2 ...
 $ Report Generation Date  : Date, format: "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" ...
 $ Outcome.x               : Ord.factor w/ 2 levels "Low"<"High": 2 2 1 1 2 1 2 NA 1 1 ...
 $ Mean maximum temperature: num  25.9 26.4 22.7 19.2 17.1 NA 13.3 13.8 17.3 NA ...
 $ Quality.y               : Factor w/ 2 levels "N","Y": 2 2 2 1 2 1 2 2 2 1 ...
 $ Outcome.y               : Ord.factor w/ 2 levels "Low"<"High": 2 2 2 1 1 NA 1 1 1 NA ...

Tidy & Manipulate Data I

# Show the structure of the joined table: one row per Year/Month observation.
str(data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':    57 obs. of  9 variables:
 $ Year                    : num  2015 2015 2015 2015 2015 ...
 $ Month                   : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Rainfall Amount         : num  1.832 1.478 0.967 1.038 1.38 ...
 $ Quality.x               : Factor w/ 2 levels "N","Y": 1 2 2 2 2 2 1 1 2 2 ...
 $ Report Generation Date  : Date, format: "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" ...
 $ Outcome.x               : Ord.factor w/ 2 levels "Low"<"High": 2 2 1 1 2 1 2 NA 1 1 ...
 $ Mean maximum temperature: num  25.9 26.4 22.7 19.2 17.1 NA 13.3 13.8 17.3 NA ...
 $ Quality.y               : Factor w/ 2 levels "N","Y": 2 2 2 1 2 1 2 2 2 1 ...
 $ Outcome.y               : Ord.factor w/ 2 levels "Low"<"High": 2 2 2 1 1 NA 1 1 1 NA ...
# Preview the first rows of the joined table.
head(data)

Tidy & Manipulate Data II

# Recode the numeric month (1-12) as a labelled factor.
# `levels = 1:12` pins each label to its month number. Without it, factor()
# assigns labels by position among the month values actually present, which
# errors (or mislabels) as soon as any month is missing from the data.
data <- mutate(
  data,
  Month = factor(
    Month,
    levels = 1:12,
    labels = c(
      "Jan", "Feb", "Mar", "April", "May", "June",
      "July", "Aug", "Sept", "Oct", "Nov", "Dec"
    )
  )
)
head(data)

Scan I

# Count missing values per column.
colSums(is.na(data))
                    Year                    Month          Rainfall Amount                Quality.x   Report Generation Date 
                       0                        0                        6                        0                        0 
               Outcome.x Mean maximum temperature                Quality.y                Outcome.y 
                       6                        5                        0                        5 
# Total number of missing cells in the whole table.
sum(is.na(data))
[1] 22
# Keep only complete rows: na.omit() drops every row that has an NA in any
# column, so a row is lost even if only one of its measurements is missing.
new_data <- na.omit(data) 
# Verify per column that no missing values remain.
colSums(is.na(new_data)) 
                    Year                    Month          Rainfall Amount                Quality.x   Report Generation Date 
                       0                        0                        0                        0                        0 
               Outcome.x Mean maximum temperature                Quality.y                Outcome.y 
                       0                        0                        0                        0 
# Total missing cells after removal (expected to be 0).
sum(is.na(new_data))
[1] 0

Scan II

# Standardise rainfall to z-scores and count observations lying more than
# three standard deviations from the mean.
z.scores_rainfall <- scores(new_data$`Rainfall Amount`, type = "z")
sum(abs(z.scores_rainfall) > 3, na.rm = TRUE)
[1] 1
# Box plot of the rainfall values for a visual outlier check.
boxplot(
  new_data$`Rainfall Amount`,
  col = "red",
  ylab = "Rainfall Amount in (mm)",
  main = "BoxPlot of  Mean Rainfall"
)

# Standardise temperature to z-scores and count observations lying more than
# three standard deviations from the mean.
z.scores_temperature <- scores(new_data$`Mean maximum temperature`, type = "z")
sum(abs(z.scores_temperature) > 3, na.rm = TRUE)
[1] 0
# Box plot of the temperature values for a visual outlier check.
boxplot(
  new_data$`Mean maximum temperature`,
  col = "blue",
  ylab = "Mean Temperature in (C)",
  main = "BoxPlot of Mean Temperature"
)

# Remove rainfall observations whose |z-score| exceeds 3, then re-score the
# remaining values and count any outliers still present.
# A logical mask is used instead of `x[-which(cond)]`: when no value meets the
# condition, which() returns integer(0) and `x[-integer(0)]` yields an EMPTY
# vector, silently discarding all data.
# NOTE(review): clean_rainfall is a standalone vector; the later plots and
# transformations visible in this file still read new_data$`Rainfall Amount`.
is_rainfall_outlier <- !is.na(z.scores_rainfall) & abs(z.scores_rainfall) > 3
clean_rainfall <- new_data$`Rainfall Amount`[!is_rainfall_outlier]
z.scores_rainfall <- clean_rainfall %>% scores(type = "z")
length(which(abs(z.scores_rainfall) > 3))
[1] 0

Here we can see that we have successfully removed all outliers from the rainfall data.

Transform

# Histogram of rainfall with a dashed red line at the sample mean.
# The title previously read "Temperature" (copy-paste from the temperature
# plot); it now names the variable actually plotted.
hist(
  new_data$`Rainfall Amount`,
  main = "Histogram of Rainfall with vertical mean line",
  xlab = " Mean Rainfall in (mm)"
)
abline(v = mean(new_data$`Rainfall Amount`), col = "red", lwd = 3, lty = 2)

# Histogram of mean maximum temperature with a dashed red line at the mean.
hist(
  new_data[["Mean maximum temperature"]],
  xlab = "Mean Temperature in (C)",
  main = "Histogram of Temperature with vertical mean line"
)
abline(
  v = mean(new_data[["Mean maximum temperature"]]),
  lty = 2,
  lwd = 3,
  col = "red"
)

# Box-Cox transform rainfall (lambda chosen automatically by forecast::BoxCox)
# and plot the transformed distribution with a dashed blue line at its mean.
boxcox_rainfall <- new_data$`Rainfall Amount` %>% BoxCox(lambda = "auto")
hist(
  boxcox_rainfall,
  xlab = " Mean Rainfall in (mm)",
  main = "Histogram of Rainfall data after Transformation"
)
abline(v = mean(boxcox_rainfall), lty = 2, lwd = 3, col = "blue")

# Box-Cox transform temperature (lambda chosen automatically by
# forecast::BoxCox) and plot the transformed distribution with a dashed blue
# line at its mean.
boxcox_temperature <- new_data$`Mean maximum temperature` %>%
  BoxCox(lambda = "auto")
hist(
  boxcox_temperature,
  xlab = "Mean Temperature in (C)",
  main = "Histogram of Temperature data after Transformation"
)
abline(v = mean(boxcox_temperature), lty = 2, lwd = 3, col = "blue")



