Santander Customer Satisfaction: Data Wrangling

Lekshman Ramesh

What Is It About?

Objective

Excluding Identical Features

# Remove columns that are exact duplicates of a later column.
#
# Two columns count as duplicates when identical() is TRUE for them. Within a
# group of duplicated columns only the right-most one is kept.
#
# Args:
#   train: a data frame.
#
# Returns:
#   `train` without the duplicated columns. The result is always a data
#   frame, even when a single column remains.
RemoveIdenticalCol <- function(train) {
  n_col <- ncol(train)

  # With fewer than two columns there is nothing to compare; returning early
  # also avoids the descending 1:0 sequence that would index a missing column.
  if (n_col < 2L) {
    return(train)
  }

  # Track columns by position rather than by name so that columns sharing a
  # name with a duplicate are not removed by accident.
  keep <- rep(TRUE, n_col)
  for (i in seq_len(n_col - 1L)) {
    for (j in (i + 1L):n_col) {
      if (identical(train[[i]], train[[j]])) {
        keep[i] <- FALSE
        # One later match is enough to discard column i.
        break
      }
    }
  }

  # drop = FALSE stops a single surviving column from collapsing to a vector.
  train[, keep, drop = FALSE]
}

Step 1 - Variable Reduction

# Drop exact-duplicate columns from the training data, then show its dimensions.
train <- RemoveIdenticalCol(train)
dim(train)
## [1] 76020   309

Excluding Low Variance Variables

# Count how often the most frequent value of `x` occurs.
#
# NA is tallied as a value of its own (useNA = "always"). When several values
# tie for the highest count, which.max() picks the first one in table order.
#
# Args:
#   x: a vector.
#
# Returns:
#   A one-row matrix with columns `check2` (count of the most frequent value)
#   and `len` (length of `x`).
repvalues <- function(x) {
  tally <- table(x, useNA = "always")
  top <- which.max(tally)
  cbind(check2 = tally[top], len = length(x))
}
# Share of each variable taken up by its most frequent value, restricted to
# the variables where that share does not exceed a threshold.
#
# Despite the name, the returned vector holds the variables that pass the
# filter, not the ones that are discarded.
#
# Args:
#   x: a data frame (or list of vectors), one element per variable.
#   n: threshold; a variable is returned when its modal share is <= n.
#
# Returns:
#   A numeric vector of modal shares, named after the elements of `x` that
#   passed the filter.
useless_var <- function(x, n) {
  # vapply() fixes the result to one double per variable, so an input with no
  # columns gives an empty vector rather than the list() sapply() would return.
  red_prop <- vapply(
    x,
    function(y) {
      mode_count <- repvalues(y)
      mode_count[1] / mode_count[2]
    },
    numeric(1)
  )

  # A zero-length variable yields 0 / 0 = NaN; exclude it explicitly so the
  # comparison cannot inject NA entries (with NA names) into the result.
  red_prop[!is.na(red_prop) & red_prop <= n]
}

Step 2 - Variable Reduction

# Keep only the variables whose most frequent value covers at most 99.5% of
# the rows.
red_prop <- useless_var(train, 0.995)
# Select by name with `[` instead of subset(): subset() evaluates `select`
# inside the data frame, so a column that happened to be called `red_prop`
# would shadow the variable of the same name.
train <- train[, names(red_prop), drop = FALSE]
dim(train)
## [1] 76020   157

Up Next