library(DT) #Data Table HTML output
library(dummies) #one-hot encoding
library(rpart) #recursive partitioning
library(rpart.plot)
Data is from Analytics Vidya’s Big Mart Sales Prediction Challenge. Objective of the project is to apply PCA in predictive modelling and predict the Item_Outlet_Sales.
train <- read.csv(file = "bigmarttrain.csv")
test <- read.csv(file = "bigmarttest.csv")
test$Item_Outlet_Sales <- 1
combi <- rbind(train, test)
From the data preview, it seems that columns Item_Weight, Item_Visibility, and Outlet_Size have missing values that needs to be imputed.
#Item_Weight imputed with median
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = T)
#Item_Visibility 0 values imputed with median
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0 , median(combi$Item_Visibility, na.rm = T), combi$Item_Visibility)
#Outlet_Size missing label replaced with "others"
levels(combi$Outlet_Size)[1] <- "Other"
As PCA is unsupervised, unique identifiers and the response variable must be removed.
# remove unique identifiers and response variable
my_data <- subset(combi, select = -c(Item_Outlet_Sales, Item_Identifier, Outlet_Identifier))
The following output describes the structure of each feature. Since PCA only works for numerical, categorical variables needs to be converted to numerical.
## 'data.frame': 14204 obs. of 9 variables:
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","Low Fat",..: 2 3 2 3 2 3 3 2 3 3 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0.054 0.054 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "Other","High",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
Seems like 6 of 9 variables need to be converting using one hot encoding.
#creating dummy data frame with factors for one hot encoding
new_my_data <- dummy.data.frame(my_data, names = c("Item_Fat_Content","Item_Type",
"Outlet_Establishment_Year","Outlet_Size",
"Outlet_Location_Type","Outlet_Type"))
Check the new dataset after one hot encoding:
## 'data.frame': 14204 obs. of 44 variables:
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_ContentLF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_Fat_ContentLow Fat : int 1 0 1 0 1 0 0 1 0 0 ...
## $ Item_Fat_ContentRegular : int 0 1 0 1 0 1 1 0 1 1 ...
## $ Item_Fat_Contentlow fat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_Fat_Contentreg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0.054 0.054 ...
## $ Item_TypeBaking Goods : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Item_TypeBreads : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeBreakfast : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeCanned : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeDairy : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeFrozen Foods : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Item_TypeFruits and Vegetables: int 0 0 0 1 0 0 0 0 0 0 ...
## $ Item_TypeHard Drinks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeHealth and Hygiene : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeHousehold : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Item_TypeMeat : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Item_TypeOthers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeSeafood : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_TypeSnack Foods : int 0 0 0 0 0 0 1 1 0 0 ...
## $ Item_TypeSoft Drinks : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Item_TypeStarchy Foods : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Establishment_Year1985 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Outlet_Establishment_Year1987 : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Outlet_Establishment_Year1997 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Outlet_Establishment_Year1998 : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Outlet_Establishment_Year1999 : int 1 0 1 0 0 0 0 0 0 0 ...
## $ Outlet_Establishment_Year2002 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Outlet_Establishment_Year2004 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Outlet_Establishment_Year2007 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Outlet_Establishment_Year2009 : int 0 1 0 0 0 1 0 0 0 0 ...
## $ Outlet_SizeOther : int 0 0 0 1 0 0 0 0 1 1 ...
## $ Outlet_SizeHigh : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Outlet_SizeMedium : int 1 1 1 0 0 1 0 1 0 0 ...
## $ Outlet_SizeSmall : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Outlet_Location_TypeTier 1 : int 1 0 1 0 0 0 0 0 0 0 ...
## $ Outlet_Location_TypeTier 2 : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Outlet_Location_TypeTier 3 : int 0 1 0 1 1 1 1 1 0 0 ...
## $ Outlet_TypeGrocery Store : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Outlet_TypeSupermarket Type1 : int 1 0 1 0 1 0 1 0 1 1 ...
## $ Outlet_TypeSupermarket Type2 : int 0 1 0 0 0 1 0 0 0 0 ...
## $ Outlet_TypeSupermarket Type3 : int 0 0 0 0 0 0 0 1 0 0 ...
## - attr(*, "dummies")=List of 6
## ..$ Item_Fat_Content : int 2 3 4 5 6
## ..$ Item_Type : int 8 9 10 11 12 13 14 15 16 17 ...
## ..$ Outlet_Establishment_Year: int 25 26 27 28 29 30 31 32 33
## ..$ Outlet_Size : int 34 35 36 37
## ..$ Outlet_Location_Type : int 38 39 40
## ..$ Outlet_Type : int 41 42 43 44
The dataset is now ready for PCA
#Dividing data
pca.train <- new_my_data[1:nrow(train), ]
pca.test <- new_my_data[-(1:nrow(train)),]
#PCA with scaling
prin_comp <- prcomp(pca.train, scale. = T)
The biplot helps to show corresponding measures of the principal components.
Here, the variance of the first 10 components are checked
std_dev <- prin_comp$sdev
pr_var <- std_dev^2
prop_varex <- pr_var/sum(pr_var)
prop_varex[1:20]
## [1] 0.10374160 0.07316543 0.06237493 0.05763369 0.04978294 0.04594636
## [7] 0.04397749 0.02877979 0.02740357 0.02657943 0.02631063 0.02577906
## [13] 0.02549683 0.02511309 0.02477200 0.02469167 0.02460995 0.02446028
## [19] 0.02398721 0.02373467
Cumulative scree plot helps to access components or factors which explains the most of variability in the data.
This plot shows that 30 components results in variance close to ~ 98%. Therefore, in this case, we’ll select number of components as 30 [PC1 to PC30] and proceed to the modeling stage. This completes the steps to implement PCA on train data. For modeling, we’ll use these 30 components as predictor variables and follow the normal procedures.
train.data <- data.frame(Item_Outlet_Sales = train$Item_Outlet_Sales, prin_comp$x)
#First 30 PCAs
train.data <- train.data[ , 1:31]
#Train decision tree on outlet sales again 30 components of train data
rpart.model <- rpart(Item_Outlet_Sales ~ . , data = train.data, method = "anova")
## Warning in sample.int(length(x), size, replace, prob): '.Random.seed' is
## not an integer vector but of type 'NULL', so ignored
rpart.plot(rpart.model)
#Transform test into PCA by using princomp from train to pca.test. newdata looks for which variable to predict
test.data <- predict(prin_comp, newdata = pca.test)
#predicited PCA with test one hot encoded dataset
test.data <- as.data.frame(test.data)
#First 30 components
test.data <- test.data[,1:30]
#Prediction on test data. Predicts the outlet sales based on the components of the test data
rpart.prediction <- predict(rpart.model, test.data)