Execute by Neha Raut
A retail giant needs to plan its new store openings over the next 12-18 months and has multiple locations where the stores can be opened. It wants to understand which locations would be the best to open new stores in terms of market size and potential revenues. Relevant data regarding locations, population, sales and revenues of key products for stores opened in the past was provided to be analyzed and for model building.
Evaluation Criteria: auc score for test data should come out to be more than 0.810
Predict if a store should be opened or not in that particular location
We have given you two datasets, store_train.csv and store_test.csv. You need to use store_train to build a predictive model for the response variable ‘store’. store_test contains all the other factors except ‘store’; you need to predict it using the model that you developed and submit your predicted values [probability scores, not the hard classes] in a CSV file.
Combining both train n test datasets prior to data preparation.
Load the required libraries (dplyr, randomForest, tidyr, tree, pROC, cvTools and car).
library(dplyr)
library(randomForest)
library(tidyr)
library(tree)
library(pROC)
library(cvTools)
library(car)
Read train and test datasets:
# Load the labelled training data and the unlabelled test data.
# stringsAsFactors is spelled FALSE rather than F: F is an ordinary variable
# that can be reassigned, FALSE is a reserved constant.
store_train <- read.csv("store_train.csv", stringsAsFactors = FALSE) # 3338 obs and 17 variables
store_test <- read.csv("store_test.csv", stringsAsFactors = FALSE) # 1431 obs and 16 variables
Combining both train n test datasets prior to data preparation.
Before combining, however, we’ll need a placeholder column that we can use to differentiate between observations coming from the train and test data. We’ll also need to add a response column to the test data so that both datasets have the same columns. We’ll fill the test data’s response column with NAs.
# Tag every row with its origin so the two sets can be separated again later.
store_train$data <- "train"

# The test set has no response; add an all-NA `store` column (before `data`,
# so the column order matches the training frame) and tag the rows.
store_test$store <- NA
store_test$data <- "test"

# Stack the two frames and inspect the combined structure.
store_all <- rbind(store_train, store_test)
glimpse(store_all)
## Observations: 4,769
## Variables: 18
## $ Id <dbl> 2300919770, 5000129575, 2501308470, 603599999, ...
## $ sales0 <int> 848, 925, 924, 924, 1017, 1494, 691, 918, 931, ...
## $ sales1 <int> 588, 717, 616, 646, 730, 1071, 476, 663, 628, 4...
## $ sales2 <int> 666, 780, 739, 683, 735, 1196, 541, 774, 775, 4...
## $ sales3 <int> 1116, 1283, 1154, 1292, 1208, 1861, 861, 1189, ...
## $ sales4 <int> 1133, 1550, 1314, 1297, 1326, 2023, 923, 1477, ...
## $ country <int> 9, 1, 13, 35, 27, 9, 103, 183, 89, 57, 3, 109, ...
## $ State <int> 23, 50, 25, 6, 50, 25, 26, 37, 12, 5, 53, 28, 3...
## $ CouSub <int> 19770, 29575, 8470, 99999, 60100, 37995, 99999,...
## $ countyname <chr> "Hancock County", "Addison County", "Hampden Co...
## $ storecode <chr> "NCNTY23009N23009", "NCNTY50001N50001", "METRO4...
## $ Areaname <chr> "Hancock County, ME", "Addison County, VT", "Sp...
## $ countytownname <chr> "Eastbrook town", "Granville town", "Brimfield ...
## $ population <int> 423, 298, 3609, 34895, 1139, 5136, 67077, 90099...
## $ state_alpha <chr> "ME", "VT", "MA", "CA", "VT", "MA", "MI", "NC",...
## $ store_Type <chr> "Supermarket Type1", "Supermarket Type1", "Supe...
## $ store <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,...
## $ data <chr> "train", "train", "train", "train", "train", "t...
We’ll drop state_alpha, countyname, countytownname and Areaname because they have too many unique values.
# NOTE(review): table() gives the frequency of each state_alpha value, unique()
# de-duplicates those frequencies and sum() adds them up. The result is therefore
# NOT the number of distinct values; length(unique(store_all$state_alpha)) would
# report that.
sum(unique(table(store_all$state_alpha)))
## [1] 4499
# Remove the four free-text location columns in a single pass.
store_all <- select(
  store_all,
  -state_alpha, -countyname, -countytownname, -Areaname
)
country has 1 NA and population has 2 NAs; both are filled with the rounded mean.
# Fill the few missing population / country values with the rounded mean.
# The mean is computed on training rows only, so no test-set information leaks
# into the features; this is the same rule the generic NA loop further down
# applies (mean over rows where data == "train").
# NOTE(review): country is an integer code, so a mean is a crude stand-in for a
# missing value -- confirm whether a mode or a dedicated level is preferable.
train_rows <- store_all$data == "train"
for (col in c("population", "country")) {
  fill_value <- round(mean(store_all[train_rows, col], na.rm = TRUE), 0)
  store_all[is.na(store_all[[col]]), col] <- fill_value
}
convert char columns into categorical columns
#' One-hot encode a categorical column, keeping only frequent levels.
#'
#' Levels whose frequency is not strictly greater than `freq_cutoff` are
#' ignored. Of the retained levels, the least frequent one is dropped so that
#' it acts as the implicit baseline. Each remaining level becomes a 0/1 numeric
#' column named `<var>_<level>` (with characters that are awkward in column
#' names stripped or replaced), and the original column is removed.
#'
#' Rows whose value is NA get 0 in every dummy column instead of NA, so a
#' missing category no longer leaks NAs into the encoded features.
#'
#' @param data A data.frame.
#' @param var Name of the column to encode (character scalar).
#' @param freq_cutoff Minimum frequency (exclusive) for a level to be kept.
#' @return `data` with `var` replaced by the dummy columns.
CreateDummies <- function(data, var, freq_cutoff = 0) {
  values <- data[[var]]

  counts <- table(values)
  counts <- sort(counts[counts > freq_cutoff])
  # The first (least frequent) retained level is the baseline: no dummy for it.
  categories <- names(counts)[-1]

  # Regex -> replacement pairs, applied in this order to build a clean name.
  patterns <- c(" ", "-", "\\?", "<", "\\+", "\\/", ">", "=", ",")
  replacements <- c("", "_", "Q", "LT_", "", "_", "GT_", "EQ_", "")

  for (level in categories) {
    name <- paste(var, level, sep = "_")
    for (k in seq_along(patterns)) {
      name <- gsub(patterns[k], replacements[k], name)
    }
    # !is.na() guards the comparison so missing values encode as 0, not NA.
    data[[name]] <- as.numeric(!is.na(values) & values == level)
  }

  data[[var]] <- NULL
  data
}
# Flag the character columns, then keep their names except for the bookkeeping
# column `data` and the response `store`.
char_logical <- vapply(store_all, is.character, logical(1))
cat_cols <- names(store_all)[char_logical]
cat_cols <- cat_cols[!(cat_cols %in% c("data", "store"))]
cat_cols
## [1] "storecode" "store_Type"
create dummies for character vars
# Encode every character column in turn; only levels that occur more than
# 50 times receive a dummy column.
for (col in cat_cols) {
  store_all <- CreateDummies(
    data = store_all,
    var = col,
    freq_cutoff = 50
  )
}

glimpse(store_all)
## Observations: 4,769
## Variables: 18
## $ Id <dbl> 2300919770, 5000129575, 2501308470...
## $ sales0 <int> 848, 925, 924, 924, 1017, 1494, 69...
## $ sales1 <int> 588, 717, 616, 646, 730, 1071, 476...
## $ sales2 <int> 666, 780, 739, 683, 735, 1196, 541...
## $ sales3 <int> 1116, 1283, 1154, 1292, 1208, 1861...
## $ sales4 <int> 1133, 1550, 1314, 1297, 1326, 2023...
## $ country <dbl> 9, 1, 13, 35, 27, 9, 103, 183, 89,...
## $ State <int> 23, 50, 25, 6, 50, 25, 26, 37, 12,...
## $ CouSub <int> 19770, 29575, 8470, 99999, 60100, ...
## $ population <dbl> 423, 298, 3609, 34895, 1139, 5136,...
## $ store <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1...
## $ data <chr> "train", "train", "train", "train"...
## $ storecode_METRO12620N23019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_NCNTY23003N23003 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_METRO14460MM1120 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ store_Type_SupermarketType3 <dbl> 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0...
## $ store_Type_GroceryStore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ store_Type_SupermarketType1 <dbl> 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1...
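Check whether any NAs remain: training rows with a missing response are removed, and NAs in the predictors are replaced with the training-set mean.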
# Drop training rows whose response is missing; test rows keep their NA
# placeholder response.
unlabelled_train <- is.na(store_all$store) & store_all$data == "train"
store_all <- store_all[!unlabelled_train, ]

# Replace any NA left in a predictor with that column's training-set mean.
# `data` and `store` are skipped.
train_rows <- store_all$data == "train"
for (col in setdiff(names(store_all), c("data", "store"))) {
  na_rows <- is.na(store_all[[col]])
  if (any(na_rows)) {
    store_all[na_rows, col] <- mean(store_all[train_rows, col], na.rm = TRUE)
  }
}

any(is.na(store_all))
## [1] TRUE
sum(is.na(store_all)) # total NA count over the whole data set
## [1] 1431
colSums(is.na(store_all)) # per-column NA counts; the remaining NAs sit in the response
## Id sales0
## 0 0
## sales1 sales2
## 0 0
## sales3 sales4
## 0 0
## country State
## 0 0
## CouSub population
## 0 0
## store data
## 1431 0
## storecode_METRO12620N23019 storecode_NCNTY23003N23003
## 0 0
## storecode_METRO14460MM1120 store_Type_SupermarketType3
## 0 0
## store_Type_GroceryStore store_Type_SupermarketType1
## 0 0
Thus data preparation is done and we will now separate the train and test data again.
# Split the combined frame back apart. The helper column `data` is dropped from
# both halves; the all-NA response is additionally dropped from the test half.
store_train <- select(filter(store_all, data == "train"), -data)
store_test <- select(filter(store_all, data == "test"), -data, -store)
We will use train_75 for model building and test_25 to test the performance of the models thus built.
Lets build tree model on train dataset.
# Reproducible 75/25 split of the labelled data into a fitting set (train_75)
# and a hold-out validation set (test_25).
set.seed(2)
n_obs <- nrow(store_train)
s <- sample(seq_len(n_obs), 0.75 * n_obs)
train_75 <- store_train[s, ]
test_25 <- store_train[-s, ]
Lets see how it performed on the validation set. Notice the difference how we get probability prediction from a tree model. By default it gives probability for both the classes , we only need one , thats why the square bracket at the end for subsetting.
Since it is classification problem , We will try logistic regression.
__Logistic Regression__
# Screen for redundant predictors: fit a linear model on the candidate
# predictors and list the three largest variance inflation factors.
for_vif <- lm(store ~ . - Id - sales0 - sales2 - sales3 - sales1, data = train_75)
head(sort(vif(for_vif), decreasing = TRUE), 3)
## store_Type_SupermarketType1 store_Type_GroceryStore
## 2.508047 1.951587
## store_Type_SupermarketType3
## 1.833542
summary(for_vif)
##
## Call:
## lm(formula = store ~ . - Id - sales0 - sales2 - sales3 - sales1,
## data = train_75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7094 -0.3630 -0.2652 0.5063 0.8359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.868e-01 6.685e-02 -2.794 0.00525 **
## sales4 4.338e-04 3.249e-05 13.351 < 2e-16 ***
## country 3.290e-05 1.058e-04 0.311 0.75581
## State 7.095e-04 6.292e-04 1.128 0.25960
## CouSub 5.111e-07 3.675e-07 1.391 0.16444
## population 9.077e-08 3.187e-08 2.849 0.00443 **
## storecode_METRO12620N23019 5.341e-01 1.012e-01 5.280 1.4e-07 ***
## storecode_NCNTY23003N23003 -4.564e-02 7.689e-02 -0.594 0.55283
## storecode_METRO14460MM1120 1.626e-01 6.419e-02 2.534 0.01134 *
## store_Type_SupermarketType3 -8.018e-03 4.008e-02 -0.200 0.84145
## store_Type_GroceryStore -2.897e-03 3.856e-02 -0.075 0.94011
## store_Type_SupermarketType1 1.780e-02 3.097e-02 0.575 0.56546
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.466 on 2491 degrees of freedom
## Multiple R-squared: 0.1185, Adjusted R-squared: 0.1146
## F-statistic: 30.43 on 11 and 2491 DF, p-value: < 2.2e-16
## Build Logistic Model
# family = binomial is required here: glm() defaults to the gaussian family,
# which fits an ordinary linear model to the 0/1 response rather than a
# logistic regression.
# NOTE: the step() trace and model summary recorded below were produced by the
# gaussian fit (see the "Dispersion parameter for gaussian family" line) and
# need to be regenerated after this change.
fit <- glm(
  store ~ . - Id - sales0 - sales2 - sales3 - sales1,
  data = train_75,
  family = binomial
) # 11 candidate predictors
# Backward stepwise selection by AIC.
fit <- step(fit)
## Start: AIC=3295.11
## store ~ (Id + sales0 + sales1 + sales2 + sales3 + sales4 + country +
## State + CouSub + population + storecode_METRO12620N23019 +
## storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 +
## store_Type_SupermarketType3 + store_Type_GroceryStore + store_Type_SupermarketType1) -
## Id - sales0 - sales2 - sales3 - sales1
##
## Df Deviance AIC
## - store_Type_GroceryStore 1 541.02 3293.1
## - store_Type_SupermarketType3 1 541.02 3293.2
## - country 1 541.03 3293.2
## - store_Type_SupermarketType1 1 541.09 3293.4
## - storecode_NCNTY23003N23003 1 541.09 3293.5
## - State 1 541.29 3294.4
## - CouSub 1 541.43 3295.1
## <none> 541.01 3295.1
## - storecode_METRO14460MM1120 1 542.41 3299.6
## - population 1 542.78 3301.3
## - storecode_METRO12620N23019 1 547.07 3321.0
## - sales4 1 579.72 3466.1
##
## Step: AIC=3293.12
## store ~ sales4 + country + State + CouSub + population + storecode_METRO12620N23019 +
## storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 +
## store_Type_SupermarketType3 + store_Type_SupermarketType1
##
## Df Deviance AIC
## - store_Type_SupermarketType3 1 541.02 3291.2
## - country 1 541.04 3291.2
## - storecode_NCNTY23003N23003 1 541.09 3291.5
## - store_Type_SupermarketType1 1 541.18 3291.9
## - State 1 541.29 3292.4
## - CouSub 1 541.44 3293.1
## <none> 541.02 3293.1
## - storecode_METRO14460MM1120 1 542.41 3297.6
## - population 1 542.78 3299.3
## - storecode_METRO12620N23019 1 547.08 3319.0
## - sales4 1 579.73 3464.1
##
## Step: AIC=3291.15
## store ~ sales4 + country + State + CouSub + population + storecode_METRO12620N23019 +
## storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 +
## store_Type_SupermarketType1
##
## Df Deviance AIC
## - country 1 541.04 3289.3
## - storecode_NCNTY23003N23003 1 541.10 3289.5
## - store_Type_SupermarketType1 1 541.28 3290.4
## - State 1 541.30 3290.4
## - CouSub 1 541.44 3291.1
## <none> 541.02 3291.2
## - storecode_METRO14460MM1120 1 542.42 3295.6
## - population 1 542.78 3297.3
## - storecode_METRO12620N23019 1 547.09 3317.1
## - sales4 1 579.79 3462.4
##
## Step: AIC=3289.25
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 +
## storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 +
## store_Type_SupermarketType1
##
## Df Deviance AIC
## - storecode_NCNTY23003N23003 1 541.12 3287.6
## - store_Type_SupermarketType1 1 541.30 3288.4
## - State 1 541.35 3288.7
## <none> 541.04 3289.3
## - CouSub 1 541.60 3289.8
## - storecode_METRO14460MM1120 1 542.44 3293.7
## - population 1 542.80 3295.3
## - storecode_METRO12620N23019 1 547.11 3315.2
## - sales4 1 579.84 3460.6
##
## Step: AIC=3287.62
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 +
## storecode_METRO14460MM1120 + store_Type_SupermarketType1
##
## Df Deviance AIC
## - store_Type_SupermarketType1 1 541.37 3286.8
## - State 1 541.45 3287.1
## <none> 541.12 3287.6
## - CouSub 1 541.81 3288.8
## - storecode_METRO14460MM1120 1 542.51 3292.0
## - population 1 542.85 3293.6
## - storecode_METRO12620N23019 1 547.28 3313.9
## - sales4 1 581.84 3467.2
##
## Step: AIC=3286.76
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 +
## storecode_METRO14460MM1120
##
## Df Deviance AIC
## - State 1 541.70 3286.3
## <none> 541.37 3286.8
## - CouSub 1 542.06 3287.9
## - storecode_METRO14460MM1120 1 542.74 3291.1
## - population 1 543.13 3292.9
## - storecode_METRO12620N23019 1 547.52 3313.0
## - sales4 1 582.08 3466.2
##
## Step: AIC=3286.29
## store ~ sales4 + CouSub + population + storecode_METRO12620N23019 +
## storecode_METRO14460MM1120
##
## Df Deviance AIC
## <none> 541.70 3286.3
## - CouSub 1 542.42 3287.6
## - storecode_METRO14460MM1120 1 543.05 3290.5
## - population 1 543.40 3292.1
## - storecode_METRO12620N23019 1 547.73 3312.0
## - sales4 1 582.10 3464.3
# Coefficient table and fit statistics for the model retained by step().
summary(fit)
##
## Call:
## glm(formula = store ~ sales4 + CouSub + population + storecode_METRO12620N23019 +
## storecode_METRO14460MM1120, data = train_75)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7047 -0.3618 -0.2710 0.5076 0.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.625e-01 5.469e-02 -2.971 0.00300 **
## sales4 4.348e-04 3.186e-05 13.646 < 2e-16 ***
## CouSub 6.133e-07 3.358e-07 1.826 0.06794 .
## population 8.894e-08 3.177e-08 2.799 0.00516 **
## storecode_METRO12620N23019 5.317e-01 1.008e-01 5.274 1.45e-07 ***
## storecode_METRO14460MM1120 1.600e-01 6.411e-02 2.496 0.01262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2169408)
##
## Null deviance: 613.72 on 2502 degrees of freedom
## Residual deviance: 541.70 on 2497 degrees of freedom
## AIC: 3286.3
##
## Number of Fisher Scoring iterations: 2
formula(fit)
## store ~ sales4 + CouSub + population + storecode_METRO12620N23019 +
## storecode_METRO14460MM1120
# Refit the selected formula as a logistic regression. Without
# family = binomial, glm() falls back to gaussian and fits a linear model to
# the 0/1 response.
# NOTE: the AUC recorded below was produced by the gaussian fit and needs to be
# regenerated after this change.
fit <- glm(
  store ~ sales4 + CouSub + population + storecode_METRO12620N23019 +
    storecode_METRO14460MM1120,
  data = train_75,
  family = binomial
) # 5 predictors
library(pROC)
# type = "response" returns predictions on the probability scale.
scoreLG <- predict(fit, newdata = test_25, type = "response")
# Validation AUC on the 25% hold-out.
roccurve <- roc(test_25$store, scoreLG)
auc(roccurve)
## Area under the curve: 0.7136
Try Decision Tree
# Single classification tree on every predictor except Id.
DT <- tree(as.factor(store) ~ . - Id, data = train_75)

# type = "vector" yields one probability column per class; the second column is
# kept as the score.
DT_probs <- predict(DT, newdata = test_25, type = "vector")
DTscore <- DT_probs[, 2]

# Validation AUC on the 25% hold-out.
auc(roc(test_25$store, DTscore))
## Area under the curve: 0.7646
Try Random Forest
# Random forest with package-default hyperparameters on every predictor
# except Id.
rf.model3 <- randomForest(as.factor(store) ~ . - Id, data = train_75)

# type = "prob" yields one probability column per class; the second column is
# kept as the score.
rf_probs <- predict(rf.model3, newdata = test_25, type = "prob")
test.score3 <- rf_probs[, 2]

# Validation AUC on the 25% hold-out.
auc(roc(test_25$store, test.score3))
## Area under the curve: 0.8166
Hence, Random Forest gives the best score, so we will use it to make predictions on the test data and submit them.
We will now tune the Random Forest hyperparameters.
library(cvTools)

# Store the response as a factor on the full training frame, which is the data
# used for the cross-validated tuning and the final fit below.
store_train[["store"]] <- as.factor(store_train[["store"]])
glimpse(store_train)
## Observations: 3,338
## Variables: 17
## $ Id <dbl> 2300919770, 5000129575, 2501308470...
## $ sales0 <int> 848, 925, 924, 924, 1017, 1494, 69...
## $ sales1 <int> 588, 717, 616, 646, 730, 1071, 476...
## $ sales2 <int> 666, 780, 739, 683, 735, 1196, 541...
## $ sales3 <int> 1116, 1283, 1154, 1292, 1208, 1861...
## $ sales4 <int> 1133, 1550, 1314, 1297, 1326, 2023...
## $ country <dbl> 9, 1, 13, 35, 27, 9, 103, 183, 89,...
## $ State <int> 23, 50, 25, 6, 50, 25, 26, 37, 12,...
## $ CouSub <int> 19770, 29575, 8470, 99999, 60100, ...
## $ population <dbl> 423, 298, 3609, 34895, 1139, 5136,...
## $ store <fct> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1...
## $ storecode_METRO12620N23019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_NCNTY23003N23003 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_METRO14460MM1120 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ store_Type_SupermarketType3 <dbl> 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0...
## $ store_Type_GroceryStore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ store_Type_SupermarketType1 <dbl> 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1...
# The full training data is used here because performance is estimated by
# cross-validation rather than on a separate hold-out.
# Candidate values for each randomForest hyperparameter:
# mtry: number of predictors tried at each split. Its upper limit is the number
#   of predictors in the data; starting around 4 or 5 and working upwards is a
#   reasonable approach.
# ntree: number of trees in the forest. There is no hard upper limit; 10 to 500
#   is a good starting range and values such as 1000 or 5000 can be tried, but
#   very large forests mainly make sense on large data. Package default is 500.
# maxnodes: maximum number of terminal nodes per tree. There is no hard limit.
#   The package default is NULL (trees grow as large as nodesize permits), not 1.
# nodesize: minimum size of terminal nodes. There is no hard limit; 1 to 20 is a
#   good range to try. Package default is 1 for classification. If the best
#   value lands on the edge of the grid, widen the grid.
param=list(mtry=c(3,4,6,8,10),
ntree=c(50,100,200,500,700,800,900),
maxnodes=c(5,10,15,20,30,50,100,300,500,600,700),
nodesize=c(1,2,5,10,20,30,40)
)
# Cost function handed to cvTuning(): area under the ROC curve.
#   store - observed responses
#   yhat  - predicted scores for the same observations
# Returns the value produced by pROC::auc() for the ROC curve of yhat vs store.
mycost_auc <- function(store, yhat) {
  pROC::auc(pROC::roc(store, yhat))
}
#The full grid has 5*7*11*7 = 2695 combinations. Trying all of them would take a very long time, so only a random subset is evaluated.
## Function for selecting a random subset of params
#'
#' Expands a list of candidate values into the full grid of combinations and
#' draws `n` of them at random, without replacement.
#'
#' @param full_list_para Named list; each element holds the candidate values of
#'   one tuning parameter.
#' @param n Number of combinations to draw (default 10). If `n` exceeds the size
#'   of the grid, every combination is returned (in random order) and a warning
#'   is raised, instead of `sample()` failing.
#' @return A data.frame with one row per drawn combination and one column per
#'   parameter. Row names are the row indices in the full grid.
subset_paras <- function(full_list_para, n = 10) {
  all_comb <- expand.grid(full_list_para)
  n_comb <- nrow(all_comb)

  if (n > n_comb) {
    warning(
      "Requested ", n, " combinations but only ", n_comb,
      " exist; returning all of them.",
      call. = FALSE
    )
    n <- n_comb
  }

  picked <- sample(seq_len(n_comb), n)
  # drop = FALSE keeps a data.frame even when the grid has a single column.
  all_comb[picked, , drop = FALSE]
}
# Number of hyperparameter combinations to evaluate in the random search.
num_trial <- 100

# Draw that many combinations at random from the grid and show them.
my_params <- subset_paras(param, n = num_trial)
my_params
## mtry ntree maxnodes nodesize
## 900 10 700 20 5
## 1068 6 500 500 5
## 2280 10 50 700 30
## 1750 10 900 50 20
## 2007 4 200 15 30
## 1173 6 500 5 10
## 249 8 50 300 1
## 2386 3 100 15 40
## 2291 3 500 700 30
## 327 4 200 600 1
## 363 6 200 700 1
## 1816 3 900 300 20
## 1496 3 800 600 10
## 1746 3 900 50 20
## 1076 3 800 500 5
## 264 8 500 300 1
## 1102 4 500 600 5
## 1333 6 50 50 10
## 988 6 100 100 5
## 944 8 900 30 5
## 718 6 500 600 2
## 1732 4 500 50 20
## 903 6 800 20 5
## 1508 6 50 700 10
## 873 6 900 15 5
## 1331 3 50 50 10
## 1164 8 100 5 10
## 531 3 100 30 2
## 1693 6 200 30 20
## 2518 6 900 50 40
## 1195 10 50 10 10
## 675 10 100 500 2
## 1177 4 700 5 10
## 1554 8 200 5 20
## 2273 6 900 600 30
## 130 10 700 20 1
## 1560 10 500 5 20
## 1351 3 700 50 10
## 849 8 100 15 5
## 1274 8 200 20 10
## 697 4 900 500 2
## 2309 8 900 700 30
## 188 6 200 50 1
## 1256 3 900 15 10
## 788 6 500 5 5
## 1118 6 900 600 5
## 452 4 900 10 2
## 730 10 800 600 2
## 1796 3 200 300 20
## 2008 6 200 15 30
## 455 10 900 10 2
## 1309 8 200 30 10
## 591 3 900 50 2
## 377 4 800 700 1
## 1458 6 700 500 10
## 524 8 900 20 2
## 152 4 200 30 1
## 406 3 700 5 2
## 1888 6 900 600 20
## 1543 6 50 5 20
## 1833 6 200 500 20
## 719 8 500 600 2
## 983 6 50 100 5
## 558 6 900 30 2
## 513 6 700 20 2
## 607 4 200 100 2
## 1895 10 50 700 20
## 15 10 200 5 1
## 2305 10 800 700 30
## 506 3 500 20 2
## 1738 6 700 50 20
## 2534 8 200 100 40
## 1852 4 900 500 20
## 1529 8 700 700 10
## 423 6 50 10 2
## 28 6 800 5 1
## 2342 4 900 5 40
## 51 3 500 10 1
## 1480 10 100 600 10
## 2649 8 700 600 40
## 2290 10 200 700 30
## 2015 10 500 15 30
## 1027 4 200 300 5
## 2324 8 200 5 40
## 685 10 500 500 2
## 105 10 900 15 1
## 248 6 50 300 1
## 2556 3 50 300 40
## 1253 6 800 15 10
## 1018 6 50 300 5
## 895 10 500 20 5
## 1078 6 800 500 5
## 1665 10 500 20 20
## 2063 6 900 20 30
## 2113 6 200 50 30
## 1971 3 200 10 30
## 1269 8 100 20 10
## 445 10 700 10 2
## 2511 3 800 50 40
## 1394 8 800 100 10
# Random search over the sampled hyperparameter combinations: cross-validate a
# random forest for each one and remember the combination with the best score.
# Only num_trial sampled combinations are evaluated, not the full grid.
myauc <- 0
for (i in seq_len(num_trial)) {
  # print(paste('starting iteration :', i))  # uncomment to track progress

  params <- my_params[i, ]

  # 15-fold cross-validation of randomForest for this single combination,
  # scored with mycost_auc on class-probability predictions.
  k <- cvTuning(
    randomForest,
    store ~ . - Id,
    data = store_train,
    tuning = params,
    folds = cvFolds(nrow(store_train), K = 15, type = "random"),
    cost = mycost_auc,
    seed = 2,
    predictArgs = list(type = "prob")
  )

  # Second column of the CV results table.
  # NOTE(review): presumably the AUC for one class -- confirm against the
  # layout of k$cv.
  score.this <- k$cv[, 2]

  is_better <- score.this > myauc
  if (is_better) {
    # print(params); print(myauc)  # uncomment to track progress
    myauc <- score.this
    best_params <- params
  }
  # print('DONE')
}

myauc
## [1] 0.8184884
This is the tentative performance measure. The best parameters are:
best_params
## mtry ntree maxnodes nodesize
## 1076 3 800 500 5
# Refit the forest on the full labelled data with the selected hyperparameters.
ci.rf.final <- randomForest(
  store ~ . - Id,
  data = store_train,
  mtry = best_params$mtry,
  ntree = best_params$ntree,
  maxnodes = best_params$maxnodes,
  nodesize = best_params$nodesize
)

# Score the unlabelled test set; type = "prob" yields one probability column
# per class and the second column is kept as the submitted score.
final_probs <- predict(ci.rf.final, newdata = store_test, type = "prob")
test.score_final <- final_probs[, 2]

# Write the probability scores (no row names) as the submission file.
write.csv(test.score_final, "Neha_Raut_P2_part2.csv", row.names = FALSE)