Execute by Neha Raut

Problem Statement:

A retail giant needs to plan its new store openings over the next 12-18 months and has multiple locations where the stores can be opened. It wants to understand which locations would be the best to open new stores in terms of market size and potential revenues. Relevant data regarding locations, population, sales and revenues of key products for stores opened in the past was provided to be analyzed and for model building.

Evaluation Criteria: auc score for test data should come out to be more than 0.810

Aim

Predict if a store should be opened or not in that particular location

Data Information:

We have given you two datasets, store_train.csv and store_test.csv. You need to use the data in store_train to build a predictive model for the response variable ‘store’. The store_test data contains all other factors except ‘store’; you need to predict it using the model that you developed and submit your predicted values [probability scores, not the hard classes] in a csv file.

Initial Setup

Combining both train n test datasets prior to data preparation.

Loading dplyr and the other libraries used in this analysis

library(dplyr) 
library(randomForest) 
library(tidyr)
library(tree)
library(pROC)
library(cvTools)
library(car)

Step 1: Read File

Read train and test datasets:

# Load the labelled training data and the unlabelled test data, keeping text
# columns as plain character vectors rather than factors.
store_train <- read.csv("store_train.csv", stringsAsFactors = FALSE) # 3338 obs and 17 variables
store_test <- read.csv("store_test.csv", stringsAsFactors = FALSE) # 1431 obs and 16 variables

Step 2: Data Preparation

Combining both the train and test datasets prior to data preparation.

Before combining however , we’ll need some placeholder column which we can use to differentiate between observations coming from train and test data. Also we’ll need to add a column for response to test data so that we have same columns in both train and test. We’ll fill test’s response column with NAs.

# Tag every row with its origin and give the test set an empty response
# column, so both sets have the same columns and can be stacked.
store_train[["data"]] <- "train"
store_test[["data"]] <- "test"
store_test[["store"]] <- NA
store_all <- rbind(store_train, store_test)
glimpse(store_all)

## Observations: 4,769
## Variables: 18
## $ Id             <dbl> 2300919770, 5000129575, 2501308470, 603599999, ...
## $ sales0         <int> 848, 925, 924, 924, 1017, 1494, 691, 918, 931, ...
## $ sales1         <int> 588, 717, 616, 646, 730, 1071, 476, 663, 628, 4...
## $ sales2         <int> 666, 780, 739, 683, 735, 1196, 541, 774, 775, 4...
## $ sales3         <int> 1116, 1283, 1154, 1292, 1208, 1861, 861, 1189, ...
## $ sales4         <int> 1133, 1550, 1314, 1297, 1326, 2023, 923, 1477, ...
## $ country        <int> 9, 1, 13, 35, 27, 9, 103, 183, 89, 57, 3, 109, ...
## $ State          <int> 23, 50, 25, 6, 50, 25, 26, 37, 12, 5, 53, 28, 3...
## $ CouSub         <int> 19770, 29575, 8470, 99999, 60100, 37995, 99999,...
## $ countyname     <chr> "Hancock County", "Addison County", "Hampden Co...
## $ storecode      <chr> "NCNTY23009N23009", "NCNTY50001N50001", "METRO4...
## $ Areaname       <chr> "Hancock County, ME", "Addison County, VT", "Sp...
## $ countytownname <chr> "Eastbrook town", "Granville town", "Brimfield ...
## $ population     <int> 423, 298, 3609, 34895, 1139, 5136, 67077, 90099...
## $ state_alpha    <chr> "ME", "VT", "MA", "CA", "VT", "MA", "MI", "NC",...
## $ store_Type     <chr> "Supermarket Type1", "Supermarket Type1", "Supe...
## $ store          <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,...
## $ data           <chr> "train", "train", "train", "train", "train", "t...

We’ll drop state_alpha, countyname, countytownname and Areaname because they have too many unique values.

# Count the distinct values of state_alpha. The earlier expression,
# sum(unique(table(...))), added up the distinct frequency counts, which is
# not a cardinality; the value recorded below was produced by that earlier
# expression.
length(unique(store_all$state_alpha))

## [1] 4499

# Remove the four location text columns in a single select() call.
store_all <- store_all %>%
  select(-state_alpha, -countyname, -countytownname, -Areaname)

country has 1 NA and population has 2; both are filled with the rounded column mean.

# Fill missing values with the rounded mean of the column, computed over the
# combined train + test rows.
store_all$country[is.na(store_all$country)] <- round(
  mean(store_all$country, na.rm = TRUE)
)
store_all$population[is.na(store_all$population)] <- round(
  mean(store_all$population, na.rm = TRUE)
)

convert char columns into categorical columns

# Replace the column `var` of `data` with 0/1 indicator columns.
#
# data        data frame holding the column to encode
# var         name of that column, as a string
# freq_cutoff levels occurring `freq_cutoff` times or fewer get no indicator
#
# Of the levels that pass the cutoff, the first one in ascending frequency
# order is left out as the baseline. Indicator columns are named
# "<var>_<level>", cleaned by the substitutions listed below. Returns `data`
# with the indicators appended and the original column removed.
CreateDummies <- function(data, var, freq_cutoff = 0) {
  level_counts <- table(data[, var])
  level_counts <- sort(level_counts[level_counts > freq_cutoff])
  kept_levels <- names(level_counts)[-1]

  # pattern / replacement pairs, applied one after another in this order
  name_rules <- list(
    c(" ", ""),
    c("-", "_"),
    c("\\?", "Q"),
    c("<", "LT_"),
    c("\\+", ""),
    c("\\/", "_"),
    c(">", "GT_"),
    c("=", "EQ_"),
    c(",", "")
  )

  for (lvl in kept_levels) {
    col_name <- paste(var, lvl, sep = "_")
    for (rule in name_rules) {
      col_name <- gsub(rule[1], rule[2], col_name)
    }
    data[, col_name] <- as.numeric(data[, var] == lvl)
  }

  data[, var] <- NULL
  data
}

# Names of the character columns still to be encoded, leaving out the
# train/test marker and the response.
char_logical <- vapply(store_all, is.character, logical(1))
cat_cols <- setdiff(names(store_all)[char_logical], c("data", "store"))
cat_cols

## [1] "storecode"  "store_Type"

create dummies for character vars

# Encode each character column in turn; levels seen 50 times or fewer get no
# indicator column.
store_all <- Reduce(
  function(encoded, col) CreateDummies(encoded, col, 50),
  cat_cols,
  store_all
)

glimpse(store_all)

## Observations: 4,769
## Variables: 18
## $ Id                          <dbl> 2300919770, 5000129575, 2501308470...
## $ sales0                      <int> 848, 925, 924, 924, 1017, 1494, 69...
## $ sales1                      <int> 588, 717, 616, 646, 730, 1071, 476...
## $ sales2                      <int> 666, 780, 739, 683, 735, 1196, 541...
## $ sales3                      <int> 1116, 1283, 1154, 1292, 1208, 1861...
## $ sales4                      <int> 1133, 1550, 1314, 1297, 1326, 2023...
## $ country                     <dbl> 9, 1, 13, 35, 27, 9, 103, 183, 89,...
## $ State                       <int> 23, 50, 25, 6, 50, 25, 26, 37, 12,...
## $ CouSub                      <int> 19770, 29575, 8470, 99999, 60100, ...
## $ population                  <dbl> 423, 298, 3609, 34895, 1139, 5136,...
## $ store                       <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1...
## $ data                        <chr> "train", "train", "train", "train"...
## $ storecode_METRO12620N23019  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_NCNTY23003N23003  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_METRO14460MM1120  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ store_Type_SupermarketType3 <dbl> 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0...
## $ store_Type_GroceryStore     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ store_Type_SupermarketType1 <dbl> 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1...

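Check whether any NAs remain: training rows with a missing response are dropped, and missing predictor values are filled with the training-set mean.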

# Keep every row except training rows that lack a response, then fill gaps in
# each predictor column with that column's mean over the training rows.
store_all <- store_all[!is.na(store_all$store) | store_all$data != "train", ]
for (col in names(store_all)) {
  if (col %in% c("data", "store")) {
    next
  }
  if (anyNA(store_all[, col])) {
    store_all[is.na(store_all[, col]), col] <- mean(
      store_all[store_all$data == "train", col],
      na.rm = TRUE
    )
  }
}

any(is.na(store_all))

## [1] TRUE

sum(is.na(store_all)) # For entire dataset

## [1] 1431

colSums(is.na(store_all)) #it is for response var

##                          Id                      sales0 
##                           0                           0 
##                      sales1                      sales2 
##                           0                           0 
##                      sales3                      sales4 
##                           0                           0 
##                     country                       State 
##                           0                           0 
##                      CouSub                  population 
##                           0                           0 
##                       store                        data 
##                        1431                           0 
##  storecode_METRO12620N23019  storecode_NCNTY23003N23003 
##                           0                           0 
##  storecode_METRO14460MM1120 store_Type_SupermarketType3 
##                           0                           0 
##     store_Type_GroceryStore store_Type_SupermarketType1 
##                           0                           0

Thus data preparation is done, and we will now separate the train and test data again.

# Separate the prepared rows again; the marker column is dropped from both
# sets and the all-NA response is dropped from the test set.
store_train <- store_all %>%
  filter(data == "train") %>%
  select(-data)
store_test <- store_all %>%
  filter(data == "test") %>%
  select(-data, -store)

Step 3: Model Building

We will use train_75 for model building and use test_25 to test the performance of the models thus built.

Let's first split the train dataset, so that the models can be built on one part and checked on the other.

# Fixed seed so the random 75/25 split of the training data is repeatable.
set.seed(2)
s <- sample(seq_len(nrow(store_train)), 0.75 * nrow(store_train))
train_75 <- store_train[s, ]
test_25 <- store_train[-s, ]

Let's see how each model performs on the validation set. Notice how we get probability predictions from a tree model: by default it gives probabilities for both classes, but we only need one, which is why there is a square bracket at the end for subsetting.

Since it is a classification problem, we will try logistic regression first.

**Logistic Regression**

# Check for redundant predictors first: fit a linear model on the candidate
# predictors and show the three largest variance inflation factors.
for_vif <- lm(
  store ~ . - Id - sales0 - sales2 - sales3 - sales1,
  data = train_75
)
sort(vif(for_vif), decreasing = TRUE)[1:3]

## store_Type_SupermarketType1     store_Type_GroceryStore 
##                    2.508047                    1.951587 
## store_Type_SupermarketType3 
##                    1.833542

summary(for_vif)

## 
## Call:
## lm(formula = store ~ . - Id - sales0 - sales2 - sales3 - sales1, 
##     data = train_75)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7094 -0.3630 -0.2652  0.5063  0.8359 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.868e-01  6.685e-02  -2.794  0.00525 ** 
## sales4                       4.338e-04  3.249e-05  13.351  < 2e-16 ***
## country                      3.290e-05  1.058e-04   0.311  0.75581    
## State                        7.095e-04  6.292e-04   1.128  0.25960    
## CouSub                       5.111e-07  3.675e-07   1.391  0.16444    
## population                   9.077e-08  3.187e-08   2.849  0.00443 ** 
## storecode_METRO12620N23019   5.341e-01  1.012e-01   5.280  1.4e-07 ***
## storecode_NCNTY23003N23003  -4.564e-02  7.689e-02  -0.594  0.55283    
## storecode_METRO14460MM1120   1.626e-01  6.419e-02   2.534  0.01134 *  
## store_Type_SupermarketType3 -8.018e-03  4.008e-02  -0.200  0.84145    
## store_Type_GroceryStore     -2.897e-03  3.856e-02  -0.075  0.94011    
## store_Type_SupermarketType1  1.780e-02  3.097e-02   0.575  0.56546    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.466 on 2491 degrees of freedom
## Multiple R-squared:  0.1185, Adjusted R-squared:  0.1146 
## F-statistic: 30.43 on 11 and 2491 DF,  p-value: < 2.2e-16

##Build Logistic Model

# Fit a logistic regression on the 75% training split. family = "binomial" is
# required: glm() defaults to the gaussian family, which fits an ordinary
# linear model to the 0/1 response instead of a logistic one. The step() trace
# and summary recorded below were produced by that earlier gaussian fit.
fit <- glm(
  store ~ . - Id - sales0 - sales2 - sales3 - sales1,
  data = train_75,
  family = "binomial"
) # 11 candidate predictors remain after the exclusions
# Stepwise selection by AIC, starting from the full model.
fit <- step(fit)

## Start:  AIC=3295.11
## store ~ (Id + sales0 + sales1 + sales2 + sales3 + sales4 + country + 
##     State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 + 
##     store_Type_SupermarketType3 + store_Type_GroceryStore + store_Type_SupermarketType1) - 
##     Id - sales0 - sales2 - sales3 - sales1
## 
##                               Df Deviance    AIC
## - store_Type_GroceryStore      1   541.02 3293.1
## - store_Type_SupermarketType3  1   541.02 3293.2
## - country                      1   541.03 3293.2
## - store_Type_SupermarketType1  1   541.09 3293.4
## - storecode_NCNTY23003N23003   1   541.09 3293.5
## - State                        1   541.29 3294.4
## - CouSub                       1   541.43 3295.1
## <none>                             541.01 3295.1
## - storecode_METRO14460MM1120   1   542.41 3299.6
## - population                   1   542.78 3301.3
## - storecode_METRO12620N23019   1   547.07 3321.0
## - sales4                       1   579.72 3466.1
## 
## Step:  AIC=3293.12
## store ~ sales4 + country + State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 + 
##     store_Type_SupermarketType3 + store_Type_SupermarketType1
## 
##                               Df Deviance    AIC
## - store_Type_SupermarketType3  1   541.02 3291.2
## - country                      1   541.04 3291.2
## - storecode_NCNTY23003N23003   1   541.09 3291.5
## - store_Type_SupermarketType1  1   541.18 3291.9
## - State                        1   541.29 3292.4
## - CouSub                       1   541.44 3293.1
## <none>                             541.02 3293.1
## - storecode_METRO14460MM1120   1   542.41 3297.6
## - population                   1   542.78 3299.3
## - storecode_METRO12620N23019   1   547.08 3319.0
## - sales4                       1   579.73 3464.1
## 
## Step:  AIC=3291.15
## store ~ sales4 + country + State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 + 
##     store_Type_SupermarketType1
## 
##                               Df Deviance    AIC
## - country                      1   541.04 3289.3
## - storecode_NCNTY23003N23003   1   541.10 3289.5
## - store_Type_SupermarketType1  1   541.28 3290.4
## - State                        1   541.30 3290.4
## - CouSub                       1   541.44 3291.1
## <none>                             541.02 3291.2
## - storecode_METRO14460MM1120   1   542.42 3295.6
## - population                   1   542.78 3297.3
## - storecode_METRO12620N23019   1   547.09 3317.1
## - sales4                       1   579.79 3462.4
## 
## Step:  AIC=3289.25
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_NCNTY23003N23003 + storecode_METRO14460MM1120 + 
##     store_Type_SupermarketType1
## 
##                               Df Deviance    AIC
## - storecode_NCNTY23003N23003   1   541.12 3287.6
## - store_Type_SupermarketType1  1   541.30 3288.4
## - State                        1   541.35 3288.7
## <none>                             541.04 3289.3
## - CouSub                       1   541.60 3289.8
## - storecode_METRO14460MM1120   1   542.44 3293.7
## - population                   1   542.80 3295.3
## - storecode_METRO12620N23019   1   547.11 3315.2
## - sales4                       1   579.84 3460.6
## 
## Step:  AIC=3287.62
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_METRO14460MM1120 + store_Type_SupermarketType1
## 
##                               Df Deviance    AIC
## - store_Type_SupermarketType1  1   541.37 3286.8
## - State                        1   541.45 3287.1
## <none>                             541.12 3287.6
## - CouSub                       1   541.81 3288.8
## - storecode_METRO14460MM1120   1   542.51 3292.0
## - population                   1   542.85 3293.6
## - storecode_METRO12620N23019   1   547.28 3313.9
## - sales4                       1   581.84 3467.2
## 
## Step:  AIC=3286.76
## store ~ sales4 + State + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_METRO14460MM1120
## 
##                              Df Deviance    AIC
## - State                       1   541.70 3286.3
## <none>                            541.37 3286.8
## - CouSub                      1   542.06 3287.9
## - storecode_METRO14460MM1120  1   542.74 3291.1
## - population                  1   543.13 3292.9
## - storecode_METRO12620N23019  1   547.52 3313.0
## - sales4                      1   582.08 3466.2
## 
## Step:  AIC=3286.29
## store ~ sales4 + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_METRO14460MM1120
## 
##                              Df Deviance    AIC
## <none>                            541.70 3286.3
## - CouSub                      1   542.42 3287.6
## - storecode_METRO14460MM1120  1   543.05 3290.5
## - population                  1   543.40 3292.1
## - storecode_METRO12620N23019  1   547.73 3312.0
## - sales4                      1   582.10 3464.3

summary(fit)

## 
## Call:
## glm(formula = store ~ sales4 + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_METRO14460MM1120, data = train_75)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7047  -0.3618  -0.2710   0.5076   0.8522  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -1.625e-01  5.469e-02  -2.971  0.00300 ** 
## sales4                      4.348e-04  3.186e-05  13.646  < 2e-16 ***
## CouSub                      6.133e-07  3.358e-07   1.826  0.06794 .  
## population                  8.894e-08  3.177e-08   2.799  0.00516 ** 
## storecode_METRO12620N23019  5.317e-01  1.008e-01   5.274 1.45e-07 ***
## storecode_METRO14460MM1120  1.600e-01  6.411e-02   2.496  0.01262 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2169408)
## 
##     Null deviance: 613.72  on 2502  degrees of freedom
## Residual deviance: 541.70  on 2497  degrees of freedom
## AIC: 3286.3
## 
## Number of Fisher Scoring iterations: 2

formula(fit)

## store ~ sales4 + CouSub + population + storecode_METRO12620N23019 + 
##     storecode_METRO14460MM1120

# Refit the formula selected by step() as a logistic regression.
# family = "binomial" is required: glm() defaults to the gaussian family,
# which is a plain linear model. The AUC recorded below was produced by the
# earlier gaussian fit.
fit <- glm(
  store ~ sales4 + CouSub + population + storecode_METRO12620N23019 +
    storecode_METRO14460MM1120,
  data = train_75,
  family = "binomial"
) # 5 predictors

library(pROC)
# type = "response" returns predicted probabilities for a binomial model.
scoreLG <- predict(fit, newdata = test_25, type = "response")
roccurve <- roc(test_25$store, scoreLG)
auc(roccurve)

## Area under the curve: 0.7136

Try Decision Tree

# Classification tree on every predictor except Id.
DT <- tree(as.factor(store) ~ . - Id, data = train_75)

# Keep only the second column of the class-probability predictions, then
# score it on the validation split.
DTscore <- predict(DT, newdata = test_25, type = "vector")[, 2]
auc(roc(test_25$store, DTscore))

## Area under the curve: 0.7646

Try Random Forest

# Random forest with default settings on every predictor except Id; the
# second column of the class probabilities is scored on the validation split.
rf.model3 <- randomForest(as.factor(store) ~ . - Id, data = train_75)
test.score3 <- predict(rf.model3, newdata = test_25, type = "prob")[, 2]
auc(roc(test_25$store, test.score3))

## Area under the curve: 0.8166

Hence, Random Forest gives the best score. We will make predictions on test with it and submit.

Now we will tune the parameters of the Random Forest.

library(cvTools)

# Turn the response into a factor so randomForest fits a classifier.
store_train <- store_train %>%
  mutate(store = as.factor(store))
glimpse(store_train)

## Observations: 3,338
## Variables: 17
## $ Id                          <dbl> 2300919770, 5000129575, 2501308470...
## $ sales0                      <int> 848, 925, 924, 924, 1017, 1494, 69...
## $ sales1                      <int> 588, 717, 616, 646, 730, 1071, 476...
## $ sales2                      <int> 666, 780, 739, 683, 735, 1196, 541...
## $ sales3                      <int> 1116, 1283, 1154, 1292, 1208, 1861...
## $ sales4                      <int> 1133, 1550, 1314, 1297, 1326, 2023...
## $ country                     <dbl> 9, 1, 13, 35, 27, 9, 103, 183, 89,...
## $ State                       <int> 23, 50, 25, 6, 50, 25, 26, 37, 12,...
## $ CouSub                      <int> 19770, 29575, 8470, 99999, 60100, ...
## $ population                  <dbl> 423, 298, 3609, 34895, 1139, 5136,...
## $ store                       <fct> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1...
## $ storecode_METRO12620N23019  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_NCNTY23003N23003  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ storecode_METRO14460MM1120  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ store_Type_SupermarketType3 <dbl> 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0...
## $ store_Type_GroceryStore     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ store_Type_SupermarketType1 <dbl> 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1...

#Use full train data because here we are doing CV

#Parameter values we want to try out
#mtry: number of predictors tried at each split. Its upper limit is the number of predictors in the data. A good idea is to start with 4 or 5 and then go up towards the number of variables in the data.
#ntree: the number of trees in the forest. There is no limit on it as such; a good starting range is 10 to 500, and you can try values as large as 1000 or 5000, although a very high number of trees only makes sense when the data is huge as well. Default value is 500.
#maxnodes: maximum number of terminal nodes per tree. There is no limit on this as such; start with 5 (the grid below goes up to 700). By default it is not set (NULL), so trees are grown as large as nodesize allows.
#nodesize: minimum size of a terminal node. There is no limit on this as such, but a good range to try is 1 to 20. Default value is 1 for classification. If the best value comes out at the edge of the range, try expanding the range.

# Candidate values for each random forest tuning parameter.
param <- list(
  mtry = c(3, 4, 6, 8, 10),
  ntree = c(50, 100, 200, 500, 700, 800, 900),
  maxnodes = c(5, 10, 15, 20, 30, 50, 100, 300, 500, 600, 700),
  nodesize = c(1, 2, 5, 10, 20, 30, 40)
)



# Cost function passed to cvTuning: area under the ROC curve of the
# predictions `yhat` against the observed responses `store`.
mycost_auc <- function(store, yhat) { # Real, Predicted
  pROC::auc(pROC::roc(store, yhat))
}


#The full grid has 5*7*11*7 = 2695 combinations, which would take hours to run, so we evaluate only a random subset of them.

## Function for selecting random subset of params


# Draw `n` random rows (default 10) from the full grid of parameter
# combinations.
#
# full_list_para  named list of candidate values, one element per parameter
# n               number of combinations to draw, without replacement
#
# Returns a data frame with one sampled combination per row.
subset_paras <- function(full_list_para, n = 10) {
  grid <- expand.grid(full_list_para)
  picked_rows <- sample(1:nrow(grid), n)
  grid[picked_rows, ]
}

# Evaluate only num_trial randomly chosen combinations instead of the full
# grid.
num_trial <- 100
my_params <- subset_paras(param, num_trial)
my_params

##      mtry ntree maxnodes nodesize
## 900    10   700       20        5
## 1068    6   500      500        5
## 2280   10    50      700       30
## 1750   10   900       50       20
## 2007    4   200       15       30
## 1173    6   500        5       10
## 249     8    50      300        1
## 2386    3   100       15       40
## 2291    3   500      700       30
## 327     4   200      600        1
## 363     6   200      700        1
## 1816    3   900      300       20
## 1496    3   800      600       10
## 1746    3   900       50       20
## 1076    3   800      500        5
## 264     8   500      300        1
## 1102    4   500      600        5
## 1333    6    50       50       10
## 988     6   100      100        5
## 944     8   900       30        5
## 718     6   500      600        2
## 1732    4   500       50       20
## 903     6   800       20        5
## 1508    6    50      700       10
## 873     6   900       15        5
## 1331    3    50       50       10
## 1164    8   100        5       10
## 531     3   100       30        2
## 1693    6   200       30       20
## 2518    6   900       50       40
## 1195   10    50       10       10
## 675    10   100      500        2
## 1177    4   700        5       10
## 1554    8   200        5       20
## 2273    6   900      600       30
## 130    10   700       20        1
## 1560   10   500        5       20
## 1351    3   700       50       10
## 849     8   100       15        5
## 1274    8   200       20       10
## 697     4   900      500        2
## 2309    8   900      700       30
## 188     6   200       50        1
## 1256    3   900       15       10
## 788     6   500        5        5
## 1118    6   900      600        5
## 452     4   900       10        2
## 730    10   800      600        2
## 1796    3   200      300       20
## 2008    6   200       15       30
## 455    10   900       10        2
## 1309    8   200       30       10
## 591     3   900       50        2
## 377     4   800      700        1
## 1458    6   700      500       10
## 524     8   900       20        2
## 152     4   200       30        1
## 406     3   700        5        2
## 1888    6   900      600       20
## 1543    6    50        5       20
## 1833    6   200      500       20
## 719     8   500      600        2
## 983     6    50      100        5
## 558     6   900       30        2
## 513     6   700       20        2
## 607     4   200      100        2
## 1895   10    50      700       20
## 15     10   200        5        1
## 2305   10   800      700       30
## 506     3   500       20        2
## 1738    6   700       50       20
## 2534    8   200      100       40
## 1852    4   900      500       20
## 1529    8   700      700       10
## 423     6    50       10        2
## 28      6   800        5        1
## 2342    4   900        5       40
## 51      3   500       10        1
## 1480   10   100      600       10
## 2649    8   700      600       40
## 2290   10   200      700       30
## 2015   10   500       15       30
## 1027    4   200      300        5
## 2324    8   200        5       40
## 685    10   500      500        2
## 105    10   900       15        1
## 248     6    50      300        1
## 2556    3    50      300       40
## 1253    6   800       15       10
## 1018    6    50      300        5
## 895    10   500       20        5
## 1078    6   800      500        5
## 1665   10   500       20       20
## 2063    6   900       20       30
## 2113    6   200       50       30
## 1971    3   200       10       30
## 1269    8   100       20       10
## 445    10   700       10        2
## 2511    3   800       50       40
## 1394    8   800      100       10

# Random search over the sampled parameter rows: cross-validate a random
# forest for each row of my_params and remember the row with the highest score.
# myauc starts at 0, so best_params is only created once a score exceeds 0.
myauc=0

for(i in 1:num_trial){  
  #print(paste('starting iteration :',i))
  # uncomment the line above to keep track of progress
  params=my_params[i,]
  
  # 15-fold cross-validation of randomForest with this single parameter row,
  # scored by mycost_auc on class-probability predictions.
  k=cvTuning(randomForest,
             store~.-Id, 
             data =store_train,
             tuning =params,
             folds = cvFolds(nrow(store_train), K=15, type ="random"),
             cost =mycost_auc, 
             seed =2,
             predictArgs = list(type="prob"))

# presumably column 2 of k$cv is the cross-validated AUC for this parameter
# row -- confirm against the cvTuning return value
score.this=k$cv[,2]
  
## This loop is slow: each of the num_trial sampled parameter rows (out of the
## 2695 possible combinations) gets its own 15-fold cross-validation.


if(score.this>myauc){
    #print(params)
    #uncomment the line above to keep track of progress
    myauc=score.this
    #print(myauc)
    #uncomment the line above to keep track of progress
    best_params=params
  }
  #print('DONE')
}

myauc

## [1] 0.8184884

This is the tentative performance measure. The best parameters are:

best_params

##      mtry ntree maxnodes nodesize
## 1076    3   800      500        5

Lets use these to build our final model.

# Refit the forest on the full training data with the best parameter row
# found by the search.
ci.rf.final <- randomForest(
  store ~ . - Id,
  mtry = best_params$mtry,
  ntree = best_params$ntree,
  maxnodes = best_params$maxnodes,
  nodesize = best_params$nodesize,
  data = store_train
)

# Score the test set, keep the second class-probability column, and write it
# out as the submission file.
test.score_final <- predict(ci.rf.final, newdata = store_test, type = "prob")[, 2]
write.csv(test.score_final, "Neha_Raut_P2_part2.csv", row.names = FALSE)

Retail Store Prediction_Random Forest with Parameter tuning