Rpubs link:

Required packages

library(readr)
library(tidyr)
library(dplyr)
library(Hmisc)
library(outliers)
library(xlsx)
library(readxl)
library(foreign)
library(gdata)
library(rvest)
library(stringr)
library(knitr)

Executive Summary

Our report includes different data preprocessing techniques, starting from loading all the required packages and the data, The first step of preprocessing is based on understanding of data through which we demonstrated the data dimensions which shows the total rows and columns of the R object.We used structure function(str) which provides the information use to view the variable view tab. The class function has been used to identify the variety of data types including scalars, vectors (numerical, character or logical) it also includes matrices, dataframes, lists etc.By using the as.factor we converted the columns into factor column for the promotional markdowns and the last function we used for understanding was str function which changed the markdown to factor to numeric. The second task was to tidy the data in which we checked the summary of the data and removed all the NA’s in the data and imputation function to replace the mising values, We scanned numeric data for outliers and treated them whch can be seen in the boxplot.The last task was to transformed the attributes in which transformed the attributes and showed the temperature and dept transformation through histograms.

Data

The data sets were sourced from:https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/data, there were two datsets used for this project which were the historical sales data of 45 wallmart stores situated in different regions. The dataset contained 15 columns and 421570 collectively, the variables in the train.csv dataset were: Store - the store number Dept - the department number Date - the week Weekly_Sales - sales for the given department in the given store IsHoliday - whether the week is a special holiday week

The variables in the features.csv dataset were: Store - the store number Date - the week Temperature - average temperature in the region Fuel_Price - cost of fuel in the region MarkDown1-5 - anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA. CPI - the consumer price index Unemployment - the unemployment rate IsHoliday - whether the week is a special holida

getwd()
## [1] "/Users/mohammadrazzak/Documents/University/RMIT/data preprocessing/Assignment 3"
setwd("/Users/mohammadrazzak/Documents/University/RMIT/data preprocessing/Assignment 3")

features <- read_csv("features.csv")
## Parsed with column specification:
## cols(
##   Store = col_integer(),
##   Date = col_date(format = ""),
##   Temperature = col_double(),
##   Fuel_Price = col_double(),
##   MarkDown1 = col_double(),
##   MarkDown2 = col_double(),
##   MarkDown3 = col_double(),
##   MarkDown4 = col_double(),
##   MarkDown5 = col_double(),
##   CPI = col_double(),
##   Unemployment = col_double(),
##   IsHoliday = col_logical()
## )
train <- read_csv("train.csv")
## Parsed with column specification:
## cols(
##   Store = col_integer(),
##   Dept = col_integer(),
##   Date = col_date(format = ""),
##   Weekly_Sales = col_double(),
##   IsHoliday = col_logical()
## )
#Merge the datasets
#based on "Store" and "Date"
merge_ds <- merge(features,train,by=c("Store","Date"))

Understand

#describing data
dim(merge_ds)
## [1] 421570     15
#types of all variables
str(merge_ds)
## 'data.frame':    421570 obs. of  15 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-05" ...
##  $ Temperature : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ Fuel_Price  : num  2.57 2.57 2.57 2.57 2.57 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ IsHoliday.x : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  1 26 17 45 28 79 55 5 58 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ IsHoliday.y : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
class(merge_ds$Store)
## [1] "integer"
class(merge_ds$Date)
## [1] "Date"
class(merge_ds$Temperature)
## [1] "numeric"
class(merge_ds$Fuel_Price)
## [1] "numeric"
class(merge_ds$MarkDown1)
## [1] "numeric"
class(merge_ds$MarkDown2)
## [1] "numeric"
class(merge_ds$MarkDown3)
## [1] "numeric"
class(merge_ds$MarkDown4)
## [1] "numeric"
class(merge_ds$MarkDown5)
## [1] "numeric"
class(merge_ds$CPI)
## [1] "numeric"
class(merge_ds$Unemployment)
## [1] "numeric"
class(merge_ds$IsHoliday.x)
## [1] "logical"
class(merge_ds$Dept)
## [1] "integer"
class(merge_ds$Weekly_Sales)
## [1] "numeric"
class(merge_ds$IsHoliday.y)
## [1] "logical"
#convet a column into a factor column
merge_ds$Store<-as.factor(merge_ds$Store)
merge_ds$MarkDown1<-as.factor(merge_ds$MarkDown1)
merge_ds$MarkDown2<-as.factor(merge_ds$MarkDown2)
merge_ds$MarkDown3<-as.factor(merge_ds$MarkDown3)
merge_ds$MarkDown4<-as.factor(merge_ds$MarkDown4)
merge_ds$MarkDown5<-as.factor(merge_ds$MarkDown5)


str(merge_ds)
## 'data.frame':    421570 obs. of  15 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-05" ...
##  $ Temperature : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ Fuel_Price  : num  2.57 2.57 2.57 2.57 2.57 ...
##  $ MarkDown1   : Factor w/ 2277 levels "0.27","0.5","1.5",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : Factor w/ 1499 levels "-265.76","-192",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : Factor w/ 1662 levels "-29.1","-1","-0.87",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : Factor w/ 1944 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : Factor w/ 2293 levels "135.16","153.04",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ IsHoliday.x : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  1 26 17 45 28 79 55 5 58 7 ...
##  $ Weekly_Sales: num  24924.5 11737.1 13223.8 37.4 1085.3 ...
##  $ IsHoliday.y : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Tidy & Manipulate Data I

#Determining if data is tidy <>
#Joining "Store" and "Date" to make a unique coloumn in the merged dataset 

merge_ds_unique <- unite(merge_ds, "Store_Date", Store, Date, sep = "_")
#removing duplicate columns)
merge_ds_unique$IsHoliday.y <- NULL

#Creating a varialble "tot_markdown" which is sum of markdown1 to markdown5
merge_ds_unique$tot_markdown <- merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2 + merge_ds_unique$MarkDown3 + merge_ds_unique$MarkDown4 + merge_ds_unique$MarkDown5
## Warning in Ops.factor(merge_ds_unique$MarkDown1, merge_ds_unique
## $MarkDown2): '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique
## $MarkDown2, : '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2
## + : '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2
## + : '+' not meaningful for factors
#Calculting Markdown percent = (tot_markdown/weekly_sales)*100
merge_ds_unique$Markdown_percent <- (merge_ds_unique$tot_markdown/merge_ds_unique$Weekly_Sales)*100

Tidy & Manipulate Data II

#Checking for inconsistencies and missing values and treating them
summary(merge_ds_unique)
##   Store_Date         Temperature       Fuel_Price      MarkDown1     
##  Length:421570      Min.   : -2.06   Min.   :2.472   1.5    :   102  
##  Class :character   1st Qu.: 46.68   1st Qu.:2.933   460.73 :   102  
##  Mode  :character   Median : 62.09   Median :3.452   175.64 :    93  
##                     Mean   : 60.09   Mean   :3.361   686.24 :    75  
##                     3rd Qu.: 74.28   3rd Qu.:3.738   1282.42:    75  
##                     Max.   :100.14   Max.   :4.468   (Other):150234  
##                                                      NA's   :270889  
##    MarkDown2        MarkDown3        MarkDown4        MarkDown5     
##  1.91   :   539   3      :   754   9      :   280   2743.18:   136  
##  3      :   493   6      :   710   4      :   200   1064.56:   120  
##  0.5    :   485   2      :   660   2      :   197   3557.67:    75  
##  1.5    :   471   1      :   611   3      :   146   3567.03:    75  
##  4      :   367   0.22   :   487   47     :   143   4180.29:    75  
##  (Other):108893   (Other):133869   (Other):134001   (Other):150951  
##  NA's   :310322   NA's   :284479   NA's   :286603   NA's   :270138  
##       CPI         Unemployment    IsHoliday.x          Dept      
##  Min.   :126.1   Min.   : 3.879   Mode :logical   Min.   : 1.00  
##  1st Qu.:132.0   1st Qu.: 6.891   FALSE:391909    1st Qu.:18.00  
##  Median :182.3   Median : 7.866   TRUE :29661     Median :37.00  
##  Mean   :171.2   Mean   : 7.960                   Mean   :44.26  
##  3rd Qu.:212.4   3rd Qu.: 8.572                   3rd Qu.:74.00  
##  Max.   :227.2   Max.   :14.313                   Max.   :99.00  
##                                                                  
##   Weekly_Sales    tot_markdown   Markdown_percent
##  Min.   : -4989   Mode:logical   Min.   : NA     
##  1st Qu.:  2080   NA's:421570    1st Qu.: NA     
##  Median :  7612                  Median : NA     
##  Mean   : 15981                  Mean   :NaN     
##  3rd Qu.: 20206                  3rd Qu.: NA     
##  Max.   :693099                  Max.   : NA     
##                                  NA's   :421570
sapply(merge_ds_unique, function(x) sum(is.na(x)))
##       Store_Date      Temperature       Fuel_Price        MarkDown1 
##                0                0                0           270889 
##        MarkDown2        MarkDown3        MarkDown4        MarkDown5 
##           310322           284479           286603           270138 
##              CPI     Unemployment      IsHoliday.x             Dept 
##                0                0                0                0 
##     Weekly_Sales     tot_markdown Markdown_percent 
##                0           421570           421570

Scan I

#EXPLANATION
merge_ds_unique$MarkDown1<-impute(merge_ds_unique$MarkDown1, fun = mode)
merge_ds_unique$MarkDown2<-impute(merge_ds_unique$MarkDown2, fun = mode)
merge_ds_unique$MarkDown3<-impute(merge_ds_unique$MarkDown3, fun = mode)
merge_ds_unique$MarkDown4<-impute(merge_ds_unique$MarkDown4, fun = mode)
merge_ds_unique$MarkDown5<-impute(merge_ds_unique$MarkDown5, fun = mode)

merge_ds_unique$tot_markdown<-impute(merge_ds_unique$tot_markdown, fun = mode)
merge_ds_unique$Markdown_percent<-impute(merge_ds_unique$Markdown_percent, fun = mode)

sapply(merge_ds_unique, function(x) sum(is.na(x)))
##       Store_Date      Temperature       Fuel_Price        MarkDown1 
##                0                0                0                0 
##        MarkDown2        MarkDown3        MarkDown4        MarkDown5 
##                0                0                0                0 
##              CPI     Unemployment      IsHoliday.x             Dept 
##                0                0                0                0 
##     Weekly_Sales     tot_markdown Markdown_percent 
##                0                0                0

Scan II

#Scan numeric data for outliers and treat them


merge_ds_unique$Temperature %>% boxplot(main="Box Plot of Temperature", ylab="Temperature", col = "grey")

merge_ds_unique$Fuel_Price %>% boxplot(main="Box Plot of Fuel_Price", ylab="Fuel_Price", col = "grey")

merge_ds_unique$CPI %>% boxplot(main="Box Plot of CPI", ylab="CPI", col = "grey")

merge_ds_unique$Dept %>% boxplot(main="Box Plot of Dept", ylab="Dept", col = "grey")

merge_ds_unique$Weekly_Sales %>% boxplot(main="Box Plot of Weekly_Sales", ylab="Weekly_Sales", col = "grey")

Transform

#Transform attributes
Scale_Fuel_Price <- scale(merge_ds_unique$Fuel_Price, center = FALSE, scale = TRUE)
head(Scale_Fuel_Price)
##           [,1]
## [1,] 0.7582185
## [2,] 0.7582185
## [3,] 0.7582185
## [4,] 0.7582185
## [5,] 0.7582185
## [6,] 0.7582185
hist(merge_ds_unique$Temperature)

hist(merge_ds_unique$Dept)