library(caret)
library(plyr)
library(dplyr)
library(dummies)
library(mlr)
library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(Metrics)
library(randomForest)
train = read.csv("train.csv",na.strings = c(""," ",NA,"NA"))
test = read.csv("test.csv",na.strings = c(""," ",NA,"NA"))
summary(train)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 932 Tier 1:2388
## 1st Qu.:1987 Medium:2793 Tier 2:2785
## Median :1999 Small :2388 Tier 3:3350
## Mean :1998 NA's :2410
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
str(train)
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 3 levels "High","Medium",..: 2 2 2 NA 1 2 1 2 NA NA ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
summary(test)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 LF : 206 Min. :0.00000
## FDK57 : 8 1st Qu.: 8.645 low fat: 66 1st Qu.:0.02705
## FDN52 : 8 Median :12.500 Low Fat:3396 Median :0.05415
## FDP15 : 8 Mean :12.696 reg : 78 Mean :0.06568
## FDQ60 : 8 3rd Qu.:16.700 Regular:1935 3rd Qu.:0.09346
## FDW10 : 8 Max. :21.350 Max. :0.32364
## (Other):5633 NA's :976
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 621 Tier 1:1592
## 1st Qu.:1987 Medium:1862 Tier 2:1856
## Median :1999 Small :1592 Tier 3:2233
## Mean :1998 NA's :1606
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
str(test)
## 'data.frame': 5681 obs. of 11 variables:
## $ Item_Identifier : Factor w/ 1543 levels "DRA12","DRA24",..: 1104 1068 1407 810 1185 462 605 267 669 171 ...
## $ Item_Weight : num 20.75 8.3 14.6 7.32 NA ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 4 3 3 5 5 5 3 5 3 ...
## $ Item_Visibility : num 0.00756 0.03843 0.09957 0.01539 0.1186 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 14 5 12 14 5 7 1 1 14 1 ...
## $ Item_MRP : num 107.9 87.3 241.8 155 234.2 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 3 1 3 6 9 4 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2007 1998 2007 1985 1997 2009 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 3 levels "High","Medium",..: 2 NA NA NA 2 3 2 2 NA NA ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 2 3 2 3 1 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 2 1 2 4 2 3 4 2 2 ...
Infrences drawn from data exploration :-
Factor mismatch in Item_Fat_Content.
Missing values in Item_Weight and Outlet_Size.
Minimum value of Item_Visibility is 0,which is not practically possible.Hence,we’ll deal them as missing values.
boxplot(train$Item_MRP,main = "Boxplot of Item MRP")
boxplot(train$Item_Weight,main = "Boxplot of Item Weight")
boxplot(train$Item_Visibility,main = "Boxplot of Item Visibility")
ggplot(train,aes(x=Item_Visibility,y=Item_Outlet_Sales)) + geom_point(color = "red") + ggtitle("Item Visibility vs Item Outlet Sales")
ggplot(train,aes(x=Outlet_Identifier,y= Item_Outlet_Sales)) + geom_bar(stat="identity",color = "blue") + ggtitle("Outlet vs Outlet Sales") + theme(axis.text.x = element_text(angle = 70,vjust = 0.5,color = "blue"))
ggplot(train,aes(x=Item_Type,y= Item_Outlet_Sales)) + geom_bar(stat="identity",color = "purple") + ggtitle("Item Type vs Item Sales") + theme(axis.text.x = element_text(angle = 70,vjust = 0.5,color = "black"))
## Dealing with categorical and continuous variables
We will use median imputation to deal with continuous missing values
test$Item_Outlet_Sales = 1
comb = rbind(train,test)
comb$Item_Weight[is.na(comb$Item_Weight)] = median(comb$Item_Weight,na.rm = T)
comb$Item_Visibility = ifelse(comb$Item_Visibility==0,median(comb$Item_Visibility),comb$Item_Visibility)
comb$Outlet_Size = ifelse(is.na(comb$Outlet_Size),"Others",comb$Outlet_Size)
comb$Outlet_Size = as.factor(comb$Outlet_Size)
levels(comb$Outlet_Size)[1] = "High"
levels(comb$Outlet_Size)[2] = "Medium"
levels(comb$Outlet_Size)[3] = "Low"
table(comb$Item_Fat_Content)
##
## LF low fat Low Fat reg Regular
## 522 178 8485 195 4824
comb$Item_Fat_Content = revalue(comb$Item_Fat_Content,c("LF" = "Low Fat","reg"="Regular"))
comb$Item_Fat_Content = revalue(comb$Item_Fat_Content,c("low fat"="Low Fat"))
table(comb$Item_Fat_Content)
##
## Low Fat Regular
## 9185 5019
temp = comb%>%group_by(Outlet_Identifier)%>%tally()
names(temp)[2] = "Outlet_Count"
comb = full_join(comb,temp,by = "Outlet_Identifier")
temp1 = comb%>%group_by(Item_Identifier)%>%tally()
names(temp1)[2] = "Item_Count"
comb = merge(comb,temp1,by = "Item_Identifier")
temp2 = comb%>%select(Outlet_Establishment_Year)%>%mutate(Outlet_Year = 2013 - comb$Outlet_Establishment_Year)
temp2$Outlet_Establishment_Year = NULL
comb = cbind(comb,temp2 )
items = substr(comb$Item_Identifier,1,2)
items = gsub("FD","Food",items)
items = gsub("DR","Drinks",items)
items = gsub("NC","Non Consumable",items)
comb$Item_Type_New = factor(items)
str(comb)
## 'data.frame': 14204 obs. of 16 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ Item_Weight : num 11.6 11.6 11.6 11.6 12.6 ...
## $ Item_Fat_Content : Factor w/ 2 levels "Low Fat","Regular": 1 1 1 1 1 1 1 1 1 2 ...
## $ Item_Visibility : num 0.054 0.041 0.054 0.0409 0.0407 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Item_MRP : num 142 141 142 143 140 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 7 10 8 9 6 4 1 2 3 9 ...
## $ Outlet_Establishment_Year: int 2004 1999 2002 1997 1985 2009 1998 1987 2007 1997 ...
## $ Outlet_Size : Factor w/ 4 levels "High","Medium",..: 3 2 4 3 2 2 4 1 4 3 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 2 1 2 1 3 3 3 3 2 1 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 2 2 2 4 3 1 2 2 2 ...
## $ Item_Outlet_Sales : num 993 1 3829 1 1 ...
## $ Outlet_Count : int 1550 1550 1548 1550 1559 1546 925 1553 1543 1550 ...
## $ Item_Count : int 9 9 9 9 9 9 9 9 9 10 ...
## $ Outlet_Year : num 9 14 11 16 28 4 15 26 6 16 ...
## $ Item_Type_New : Factor w/ 3 levels "Drinks","Food",..: 1 1 1 1 1 1 1 1 1 1 ...
comb = dummy.data.frame(comb,names = c("Outlet_Size","Outlet_Location_Type","Outlet_Type","Item_Type_New","Item_Fat_Content"),sep='_')
str(comb)
## 'data.frame': 14204 obs. of 27 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ Item_Weight : num 11.6 11.6 11.6 11.6 12.6 ...
## $ Item_Fat_Content_Low Fat : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Item_Fat_Content_Regular : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Item_Visibility : num 0.054 0.041 0.054 0.0409 0.0407 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Item_MRP : num 142 141 142 143 140 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 7 10 8 9 6 4 1 2 3 9 ...
## $ Outlet_Establishment_Year : int 2004 1999 2002 1997 1985 2009 1998 1987 2007 1997 ...
## $ Outlet_Size_High : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Outlet_Size_Medium : int 0 1 0 0 1 1 0 0 0 0 ...
## $ Outlet_Size_Low : int 1 0 0 1 0 0 0 0 0 1 ...
## $ Outlet_Size_Others : int 0 0 1 0 0 0 1 0 1 0 ...
## $ Outlet_Location_Type_Tier 1 : int 0 1 0 1 0 0 0 0 0 1 ...
## $ Outlet_Location_Type_Tier 2 : int 1 0 1 0 0 0 0 0 1 0 ...
## $ Outlet_Location_Type_Tier 3 : int 0 0 0 0 1 1 1 1 0 0 ...
## $ Outlet_Type_Grocery Store : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Outlet_Type_Supermarket Type1: int 1 1 1 1 0 0 0 1 1 1 ...
## $ Outlet_Type_Supermarket Type2: int 0 0 0 0 0 1 0 0 0 0 ...
## $ Outlet_Type_Supermarket Type3: int 0 0 0 0 1 0 0 0 0 0 ...
## $ Item_Outlet_Sales : num 993 1 3829 1 1 ...
## $ Outlet_Count : int 1550 1550 1548 1550 1559 1546 925 1553 1543 1550 ...
## $ Item_Count : int 9 9 9 9 9 9 9 9 9 10 ...
## $ Outlet_Year : num 9 14 11 16 28 4 15 26 6 16 ...
## $ Item_Type_New_Drinks : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Item_Type_New_Food : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Item_Type_New_Non Consumable : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dummies")=List of 5
## ..$ Item_Fat_Content : int 3 4
## ..$ Outlet_Size : int 10 11 12 13
## ..$ Outlet_Location_Type: int 14 15 16
## ..$ Outlet_Type : int 17 18 19 20
## ..$ Item_Type_New : int 25 26 27
comb = select(comb,-c(Item_Identifier,Outlet_Identifier,Item_Type,Outlet_Establishment_Year))
new_train = comb[1:nrow(train),]
new_test = comb[-(1:nrow(train)),]
names(new_train) = make.names(names(new_train))
names(new_test) = make.names(names(new_test))
linear_model = lm(Item_Outlet_Sales ~ . ,data = new_train)
summary(linear_model)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = new_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3436.9 -1096.4 -43.0 791.8 8883.7
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.611e+04 2.202e+04 1.640 0.1011
## Item_Weight 2.461e+00 3.980e+00 0.618 0.5363
## Item_Fat_Content_Low.Fat -5.059e+01 3.508e+01 -1.442 0.1493
## Item_Fat_Content_Regular NA NA NA NA
## Item_Visibility -1.572e+02 3.501e+02 -0.449 0.6535
## Item_MRP 9.524e+00 2.631e-01 36.204 < 2e-16 ***
## Outlet_Size_High -9.254e+02 6.064e+02 -1.526 0.1270
## Outlet_Size_Medium 2.335e+02 1.132e+02 2.063 0.0392 *
## Outlet_Size_Low 1.868e+02 8.841e+01 2.113 0.0346 *
## Outlet_Size_Others NA NA NA NA
## Outlet_Location_Type_Tier.1 -1.222e+03 7.101e+02 -1.721 0.0854 .
## Outlet_Location_Type_Tier.2 -1.083e+03 7.101e+02 -1.525 0.1274
## Outlet_Location_Type_Tier.3 NA NA NA NA
## Outlet_Type_Grocery.Store -1.628e+04 8.919e+03 -1.825 0.0680 .
## Outlet_Type_Supermarket.Type1 2.058e+02 5.888e+02 0.350 0.7267
## Outlet_Type_Supermarket.Type2 -1.314e+03 1.972e+02 -6.661 2.89e-11 ***
## Outlet_Type_Supermarket.Type3 NA NA NA NA
## Outlet_Count -2.285e+01 1.417e+01 -1.613 0.1068
## Item_Count 1.781e+01 2.327e+01 0.765 0.4441
## Outlet_Year NA NA NA NA
## Item_Type_New_Drinks -1.663e+01 4.787e+01 -0.347 0.7283
## Item_Type_New_Food NA NA NA NA
## Item_Type_New_Non.Consumable NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1526 on 8507 degrees of freedom
## Multiple R-squared: 0.2097, Adjusted R-squared: 0.2083
## F-statistic: 150.5 on 15 and 8507 DF, p-value: < 2.2e-16
pred_lm = predict(linear_model,type = "response")
rmse(new_train$Item_Outlet_Sales,pred_lm)
## [1] 1524.375
tree_model = rpart(Item_Outlet_Sales ~ . ,data = new_test)
summary(tree_model)
## Call:
## rpart(formula = Item_Outlet_Sales ~ ., data = new_test)
## n= 5681
##
## CP nsplit rel error xerror xstd
## 1 0.08646909 0 1.0000000 1.0001613 0.02827382
## 2 0.05420419 1 0.9135309 0.9141970 0.02420009
## 3 0.02443891 2 0.8593267 0.8597872 0.02284921
## 4 0.01316430 3 0.8348878 0.8366061 0.02050304
## 5 0.01224056 4 0.8217235 0.8246613 0.02025643
## 6 0.01210477 5 0.8094829 0.8229437 0.02023515
## 7 0.01000000 6 0.7973782 0.8076124 0.01986044
##
## Variable importance
## Item_MRP Outlet_Count
## 33 27
## Outlet_Type_Grocery.Store Outlet_Type_Supermarket.Type3
## 20 7
## Outlet_Year Item_Visibility
## 7 3
## Item_Weight Item_Count
## 1 1
##
## Node number 1: 5681 observations, complexity param=0.08646909
## mean=1293.524, MSE=2809010
## left son=2 (2896 obs) right son=3 (2785 obs)
## Primary splits:
## Item_MRP < 143.797 to the left, improve=0.08646909, (0 missing)
## Outlet_Count < 1234 to the left, improve=0.06113592, (0 missing)
## Outlet_Type_Grocery.Store < 0.5 to the right, improve=0.06113592, (0 missing)
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, improve=0.03814215, (0 missing)
## Outlet_Size_Medium < 0.5 to the left, improve=0.01847886, (0 missing)
## Surrogate splits:
## Item_Weight < 13.05 to the left, agree=0.533, adj=0.047, (0 split)
## Item_Visibility < 0.05837035 to the left, agree=0.522, adj=0.025, (0 split)
## Item_Count < 8.5 to the right, agree=0.520, adj=0.021, (0 split)
## Item_Fat_Content_Low.Fat < 0.5 to the right, agree=0.515, adj=0.010, (0 split)
## Item_Fat_Content_Regular < 0.5 to the left, agree=0.515, adj=0.010, (0 split)
##
## Node number 2: 2896 observations, complexity param=0.0131643
## mean=810.2202, MSE=1049318
## left son=4 (386 obs) right son=5 (2510 obs)
## Primary splits:
## Outlet_Type_Grocery.Store < 0.5 to the right, improve=0.06913057, (0 missing)
## Outlet_Count < 1234 to the left, improve=0.06913057, (0 missing)
## Item_MRP < 76.6512 to the left, improve=0.05935699, (0 missing)
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, improve=0.04433924, (0 missing)
## Outlet_Size_Medium < 0.5 to the left, improve=0.02583729, (0 missing)
## Surrogate splits:
## Outlet_Count < 1234 to the left, agree=1.000, adj=1.000, (0 split)
## Item_Visibility < 0.1756642 to the right, agree=0.885, adj=0.135, (0 split)
##
## Node number 3: 2785 observations, complexity param=0.05420419
## mean=1796.091, MSE=4143372
## left son=6 (349 obs) right son=7 (2436 obs)
## Primary splits:
## Outlet_Count < 1234 to the left, improve=0.07496041, (0 missing)
## Outlet_Type_Grocery.Store < 0.5 to the right, improve=0.07496041, (0 missing)
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, improve=0.04774003, (0 missing)
## Outlet_Size_Medium < 0.5 to the left, improve=0.02061265, (0 missing)
## Item_MRP < 220.0456 to the left, improve=0.01811604, (0 missing)
## Surrogate splits:
## Outlet_Type_Grocery.Store < 0.5 to the right, agree=1.000, adj=1.000, (0 split)
## Item_Visibility < 0.1896654 to the right, agree=0.889, adj=0.112, (0 split)
##
## Node number 4: 386 observations
## mean=123.4176, MSE=23738.38
##
## Node number 5: 2510 observations, complexity param=0.01210477
## mean=915.84, MSE=1123341
## left son=10 (1020 obs) right son=11 (1490 obs)
## Primary splits:
## Item_MRP < 88.6185 to the left, improve=0.06850924, (0 missing)
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, improve=0.03339728, (0 missing)
## Outlet_Count < 1556 to the left, improve=0.03339728, (0 missing)
## Outlet_Year < 27 to the left, improve=0.03339728, (0 missing)
## Outlet_Type_Supermarket.Type1 < 0.5 to the right, improve=0.01175536, (0 missing)
## Surrogate splits:
## Item_Visibility < 0.01236591 to the left, agree=0.599, adj=0.013, (0 split)
## Item_Count < 7.5 to the left, agree=0.596, adj=0.007, (0 split)
##
## Node number 6: 349 observations
## mean=323.7147, MSE=113127.9
##
## Node number 7: 2436 observations, complexity param=0.02443891
## mean=2007.035, MSE=4365689
## left son=14 (2132 obs) right son=15 (304 obs)
## Primary splits:
## Outlet_Count < 1556 to the left, improve=0.036671610, (0 missing)
## Outlet_Year < 27 to the left, improve=0.036671610, (0 missing)
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, improve=0.036671610, (0 missing)
## Item_MRP < 220.3285 to the left, improve=0.021380880, (0 missing)
## Outlet_Type_Supermarket.Type1 < 0.5 to the right, improve=0.009034658, (0 missing)
## Surrogate splits:
## Outlet_Type_Supermarket.Type3 < 0.5 to the left, agree=1.000, adj=1.00, (0 split)
## Outlet_Year < 27 to the left, agree=1.000, adj=1.00, (0 split)
## Outlet_Type_Supermarket.Type1 < 0.5 to the right, agree=0.876, adj=0.01, (0 split)
##
## Node number 10: 1020 observations
## mean=580.5478, MSE=431194.4
##
## Node number 11: 1490 observations
## mean=1145.369, MSE=1467517
##
## Node number 14: 2132 observations, complexity param=0.01224056
## mean=1855.945, MSE=3569990
## left son=28 (1443 obs) right son=29 (689 obs)
## Primary splits:
## Item_MRP < 199.0584 to the left, improve=0.025664040, (0 missing)
## Item_Weight < 6.6925 to the left, improve=0.003830461, (0 missing)
## Outlet_Count < 1549 to the left, improve=0.003305032, (0 missing)
## Item_Visibility < 0.1859728 to the left, improve=0.003216576, (0 missing)
## Outlet_Year < 7.5 to the left, improve=0.002334160, (0 missing)
## Surrogate splits:
## Item_Weight < 5.0725 to the right, agree=0.684, adj=0.023, (0 split)
## Item_Count < 7.5 to the right, agree=0.682, adj=0.017, (0 split)
## Item_Visibility < 0.1806009 to the left, agree=0.680, adj=0.010, (0 split)
##
## Node number 15: 304 observations
## mean=3066.65, MSE=8663172
##
## Node number 28: 1443 observations
## mean=1646.788, MSE=2745550
##
## Node number 29: 689 observations
## mean=2293.991, MSE=5013142
prp(tree_model)
pred_tree = predict(tree_model,type= "vector")
rmse(new_train$Item_Outlet_Sales,pred_tree)
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## [1] 1899.484
rf_model = randomForest(Item_Outlet_Sales ~ . ,data = new_train,mtry = 2 ,ntree = 1000)
summary(rf_model)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 8523 -none- numeric
## mse 1000 -none- numeric
## rsq 1000 -none- numeric
## oob.times 8523 -none- numeric
## importance 22 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 8523 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
varImpPlot(rf_model)
pred_rf = predict(rf_model,type="response")
rmse(new_train$Item_Outlet_Sales,pred_rf)
## [1] 1566.205