Rpubs link:
library(readr)
library(tidyr)
library(dplyr)
library(Hmisc)
library(outliers)
library(xlsx)
library(readxl)
library(foreign)
library(gdata)
library(rvest)
library(stringr)
library(knitr)
Our report includes different data preprocessing techniques, starting from loading all the required packages and the data, The first step of preprocessing is based on understanding of data through which we demonstrated the data dimensions which shows the total rows and columns of the R object.We used structure function(str) which provides the information use to view the variable view tab. The class function has been used to identify the variety of data types including scalars, vectors (numerical, character or logical) it also includes matrices, dataframes, lists etc.By using the as.factor we converted the columns into factor column for the promotional markdowns and the last function we used for understanding was str function which changed the markdown to factor to numeric. The second task was to tidy the data in which we checked the summary of the data and removed all the NA’s in the data and imputation function to replace the mising values, We scanned numeric data for outliers and treated them whch can be seen in the boxplot.The last task was to transformed the attributes in which transformed the attributes and showed the temperature and dept transformation through histograms.
The data sets were sourced from:https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/data, there were two datsets used for this project which were the historical sales data of 45 wallmart stores situated in different regions. The dataset contained 15 columns and 421570 collectively, the variables in the train.csv dataset were: Store - the store number Dept - the department number Date - the week Weekly_Sales - sales for the given department in the given store IsHoliday - whether the week is a special holiday week
The variables in the features.csv dataset were: Store - the store number Date - the week Temperature - average temperature in the region Fuel_Price - cost of fuel in the region MarkDown1-5 - anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA. CPI - the consumer price index Unemployment - the unemployment rate IsHoliday - whether the week is a special holida
getwd()
## [1] "/Users/mohammadrazzak/Documents/University/RMIT/data preprocessing/Assignment 3"
setwd("/Users/mohammadrazzak/Documents/University/RMIT/data preprocessing/Assignment 3")
features <- read_csv("features.csv")
## Parsed with column specification:
## cols(
## Store = col_integer(),
## Date = col_date(format = ""),
## Temperature = col_double(),
## Fuel_Price = col_double(),
## MarkDown1 = col_double(),
## MarkDown2 = col_double(),
## MarkDown3 = col_double(),
## MarkDown4 = col_double(),
## MarkDown5 = col_double(),
## CPI = col_double(),
## Unemployment = col_double(),
## IsHoliday = col_logical()
## )
train <- read_csv("train.csv")
## Parsed with column specification:
## cols(
## Store = col_integer(),
## Dept = col_integer(),
## Date = col_date(format = ""),
## Weekly_Sales = col_double(),
## IsHoliday = col_logical()
## )
#Merge the datasets
#based on "Store" and "Date"
merge_ds <- merge(features,train,by=c("Store","Date"))
#describing data
dim(merge_ds)
## [1] 421570 15
#types of all variables
str(merge_ds)
## 'data.frame': 421570 obs. of 15 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-05" ...
## $ Temperature : num 42.3 42.3 42.3 42.3 42.3 ...
## $ Fuel_Price : num 2.57 2.57 2.57 2.57 2.57 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ IsHoliday.x : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 1 26 17 45 28 79 55 5 58 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ IsHoliday.y : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
class(merge_ds$Store)
## [1] "integer"
class(merge_ds$Date)
## [1] "Date"
class(merge_ds$Temperature)
## [1] "numeric"
class(merge_ds$Fuel_Price)
## [1] "numeric"
class(merge_ds$MarkDown1)
## [1] "numeric"
class(merge_ds$MarkDown2)
## [1] "numeric"
class(merge_ds$MarkDown3)
## [1] "numeric"
class(merge_ds$MarkDown4)
## [1] "numeric"
class(merge_ds$MarkDown5)
## [1] "numeric"
class(merge_ds$CPI)
## [1] "numeric"
class(merge_ds$Unemployment)
## [1] "numeric"
class(merge_ds$IsHoliday.x)
## [1] "logical"
class(merge_ds$Dept)
## [1] "integer"
class(merge_ds$Weekly_Sales)
## [1] "numeric"
class(merge_ds$IsHoliday.y)
## [1] "logical"
#convet a column into a factor column
merge_ds$Store<-as.factor(merge_ds$Store)
merge_ds$MarkDown1<-as.factor(merge_ds$MarkDown1)
merge_ds$MarkDown2<-as.factor(merge_ds$MarkDown2)
merge_ds$MarkDown3<-as.factor(merge_ds$MarkDown3)
merge_ds$MarkDown4<-as.factor(merge_ds$MarkDown4)
merge_ds$MarkDown5<-as.factor(merge_ds$MarkDown5)
str(merge_ds)
## 'data.frame': 421570 obs. of 15 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-05" ...
## $ Temperature : num 42.3 42.3 42.3 42.3 42.3 ...
## $ Fuel_Price : num 2.57 2.57 2.57 2.57 2.57 ...
## $ MarkDown1 : Factor w/ 2277 levels "0.27","0.5","1.5",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 1499 levels "-265.76","-192",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 1662 levels "-29.1","-1","-0.87",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 1944 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 2293 levels "135.16","153.04",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ IsHoliday.x : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 1 26 17 45 28 79 55 5 58 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ IsHoliday.y : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#Determining if data is tidy <>
#Joining "Store" and "Date" to make a unique coloumn in the merged dataset
merge_ds_unique <- unite(merge_ds, "Store_Date", Store, Date, sep = "_")
#removing duplicate columns)
merge_ds_unique$IsHoliday.y <- NULL
#Creating a varialble "tot_markdown" which is sum of markdown1 to markdown5
merge_ds_unique$tot_markdown <- merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2 + merge_ds_unique$MarkDown3 + merge_ds_unique$MarkDown4 + merge_ds_unique$MarkDown5
## Warning in Ops.factor(merge_ds_unique$MarkDown1, merge_ds_unique
## $MarkDown2): '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique
## $MarkDown2, : '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2
## + : '+' not meaningful for factors
## Warning in Ops.factor(merge_ds_unique$MarkDown1 + merge_ds_unique$MarkDown2
## + : '+' not meaningful for factors
#Calculting Markdown percent = (tot_markdown/weekly_sales)*100
merge_ds_unique$Markdown_percent <- (merge_ds_unique$tot_markdown/merge_ds_unique$Weekly_Sales)*100
#Checking for inconsistencies and missing values and treating them
summary(merge_ds_unique)
## Store_Date Temperature Fuel_Price MarkDown1
## Length:421570 Min. : -2.06 Min. :2.472 1.5 : 102
## Class :character 1st Qu.: 46.68 1st Qu.:2.933 460.73 : 102
## Mode :character Median : 62.09 Median :3.452 175.64 : 93
## Mean : 60.09 Mean :3.361 686.24 : 75
## 3rd Qu.: 74.28 3rd Qu.:3.738 1282.42: 75
## Max. :100.14 Max. :4.468 (Other):150234
## NA's :270889
## MarkDown2 MarkDown3 MarkDown4 MarkDown5
## 1.91 : 539 3 : 754 9 : 280 2743.18: 136
## 3 : 493 6 : 710 4 : 200 1064.56: 120
## 0.5 : 485 2 : 660 2 : 197 3557.67: 75
## 1.5 : 471 1 : 611 3 : 146 3567.03: 75
## 4 : 367 0.22 : 487 47 : 143 4180.29: 75
## (Other):108893 (Other):133869 (Other):134001 (Other):150951
## NA's :310322 NA's :284479 NA's :286603 NA's :270138
## CPI Unemployment IsHoliday.x Dept
## Min. :126.1 Min. : 3.879 Mode :logical Min. : 1.00
## 1st Qu.:132.0 1st Qu.: 6.891 FALSE:391909 1st Qu.:18.00
## Median :182.3 Median : 7.866 TRUE :29661 Median :37.00
## Mean :171.2 Mean : 7.960 Mean :44.26
## 3rd Qu.:212.4 3rd Qu.: 8.572 3rd Qu.:74.00
## Max. :227.2 Max. :14.313 Max. :99.00
##
## Weekly_Sales tot_markdown Markdown_percent
## Min. : -4989 Mode:logical Min. : NA
## 1st Qu.: 2080 NA's:421570 1st Qu.: NA
## Median : 7612 Median : NA
## Mean : 15981 Mean :NaN
## 3rd Qu.: 20206 3rd Qu.: NA
## Max. :693099 Max. : NA
## NA's :421570
sapply(merge_ds_unique, function(x) sum(is.na(x)))
## Store_Date Temperature Fuel_Price MarkDown1
## 0 0 0 270889
## MarkDown2 MarkDown3 MarkDown4 MarkDown5
## 310322 284479 286603 270138
## CPI Unemployment IsHoliday.x Dept
## 0 0 0 0
## Weekly_Sales tot_markdown Markdown_percent
## 0 421570 421570
#EXPLANATION
merge_ds_unique$MarkDown1<-impute(merge_ds_unique$MarkDown1, fun = mode)
merge_ds_unique$MarkDown2<-impute(merge_ds_unique$MarkDown2, fun = mode)
merge_ds_unique$MarkDown3<-impute(merge_ds_unique$MarkDown3, fun = mode)
merge_ds_unique$MarkDown4<-impute(merge_ds_unique$MarkDown4, fun = mode)
merge_ds_unique$MarkDown5<-impute(merge_ds_unique$MarkDown5, fun = mode)
merge_ds_unique$tot_markdown<-impute(merge_ds_unique$tot_markdown, fun = mode)
merge_ds_unique$Markdown_percent<-impute(merge_ds_unique$Markdown_percent, fun = mode)
sapply(merge_ds_unique, function(x) sum(is.na(x)))
## Store_Date Temperature Fuel_Price MarkDown1
## 0 0 0 0
## MarkDown2 MarkDown3 MarkDown4 MarkDown5
## 0 0 0 0
## CPI Unemployment IsHoliday.x Dept
## 0 0 0 0
## Weekly_Sales tot_markdown Markdown_percent
## 0 0 0
#Scan numeric data for outliers and treat them
merge_ds_unique$Temperature %>% boxplot(main="Box Plot of Temperature", ylab="Temperature", col = "grey")
merge_ds_unique$Fuel_Price %>% boxplot(main="Box Plot of Fuel_Price", ylab="Fuel_Price", col = "grey")
merge_ds_unique$CPI %>% boxplot(main="Box Plot of CPI", ylab="CPI", col = "grey")
merge_ds_unique$Dept %>% boxplot(main="Box Plot of Dept", ylab="Dept", col = "grey")
merge_ds_unique$Weekly_Sales %>% boxplot(main="Box Plot of Weekly_Sales", ylab="Weekly_Sales", col = "grey")
#Transform attributes
Scale_Fuel_Price <- scale(merge_ds_unique$Fuel_Price, center = FALSE, scale = TRUE)
head(Scale_Fuel_Price)
## [,1]
## [1,] 0.7582185
## [2,] 0.7582185
## [3,] 0.7582185
## [4,] 0.7582185
## [5,] 0.7582185
## [6,] 0.7582185
hist(merge_ds_unique$Temperature)
hist(merge_ds_unique$Dept)