# Lekshman Ramesh
# Load the raw data; header = TRUE means the first line of data.csv supplies
# the column names. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
od <- read.csv("data.csv", header = TRUE)
# Print the structure (column names, types and leading values) of the data.
str(od)
## 'data.frame': 20002 obs. of 7 variables:
## $ Banker : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Accounts : num 82.5 104.6 110.6 90.8 45.4 ...
## $ Maximum.Spike: num 2.28 5.18 0.59 2.63 4.46 ...
## $ Inactivity : num 0.07 0.24 0.21 0.1 0.2 0.3 0.07 0.02 0.27 0.35 ...
## $ Early.Closure: num 0.49 0.07 0.45 0.49 0.11 0.18 0.26 0.23 0.28 0.05 ...
## $ Charge.Off : num 0.15 0.09 0.09 0.19 0.05 0 0.12 0.07 0.01 0.13 ...
## $ Fraud : num 0.09 0.2 0.09 0.09 0.19 0.13 0.06 0.18 0.09 0.17 ...
# Standardise every column of od (subtract the column mean, divide by the
# column standard deviation) and convert the resulting matrix back into a
# data frame. Note that the Banker key in column 1 is standardised as well.
od2 <- as.data.frame(scale(od, center = TRUE, scale = TRUE))
# Five-number summary plus mean for each standardised column.
summary(od2)
## Banker Accounts Maximum.Spike
## Min. :-1.732 Min. :-1.74572 Min. :-1.731514
## 1st Qu.:-0.866 1st Qu.:-0.87048 1st Qu.:-0.873454
## Median : 0.000 Median : 0.01379 Median : 0.000299
## Mean : 0.000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.866 3rd Qu.: 0.86836 3rd Qu.: 0.869489
## Max. : 1.732 Max. : 1.71819 Max. : 4.652904
## Inactivity Early.Closure Charge.Off
## Min. :-1.735546 Min. :-1.72497 Min. :-1.725586
## 1st Qu.:-0.834225 1st Qu.:-0.84720 1st Qu.:-0.864403
## Median :-0.002236 Median : 0.03058 Median :-0.003221
## Mean : 0.000000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.829753 3rd Qu.: 0.84565 3rd Qu.: 0.857962
## Max. : 3.325720 Max. : 2.91469 Max. : 5.680586
## Fraud
## Min. :-1.703520
## 1st Qu.:-0.846951
## Median : 0.009618
## Mean : 0.000000
## 3rd Qu.: 0.866187
## Max. : 6.690856
# Floor every standardised value at zero: anything below the column mean
# (negative after scaling) is replaced by 0, so only above-average values
# contribute to the score. lapply() with `od2[] <-` keeps od2 a data frame
# with the same columns for any number of rows, whereas sapply() would
# collapse a one-row frame into a plain vector.
od2[] <- lapply(od2, function(x) pmax(x, 0))
# Overall score: sum of the squared floored values across columns 2 to 7.
# Column 1 (the Banker key) is ignored. No square root is taken, which does
# not change how rows rank against each other.
od2$ed_overall <- rowSums(od2[, 2:7]^2)
summary(od2)
## Banker Accounts Maximum.Spike Inactivity
## Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000
## Median :0.0000433 Median :0.01379 Median :0.000299 Median :0.0000
## Mean :0.4330019 Mean :0.43323 Mean :0.432576 Mean :0.4327
## 3rd Qu.:0.8659605 3rd Qu.:0.86836 3rd Qu.:0.869489 3rd Qu.:0.8298
## Max. :1.7319209 Max. :1.71819 Max. :4.652904 Max. :3.3257
## Early.Closure Charge.Off Fraud ed_overall
## Min. :0.00000 Min. :0.0000 Min. :0.000000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.: 1.476
## Median :0.03058 Median :0.0000 Median :0.009618 Median : 2.762
## Mean :0.43308 Mean :0.4322 Mean :0.432536 Mean : 2.997
## 3rd Qu.:0.84565 3rd Qu.:0.8580 3rd Qu.:0.866187 3rd Qu.: 4.262
## Max. :2.91469 Max. :5.6806 Max. :6.690856 Max. :45.543
library(ggplot2)
# Single box plot of the overall score. x is fixed at 1 because there is only
# one group, so the x-axis title and tick labels are blanked out. Outlying
# points are drawn as large red dots.
ggplot(data = od2, aes(x = 1, y = ed_overall)) +
  geom_boxplot(
    outlier.colour = "red", outlier.size = 3,
    fill = "lightblue", color = "darkblue"
  ) +
  theme(axis.title.x = element_blank(), axis.text.x = element_blank())
# Copy the score into a standalone vector. This must be its own statement:
# written on the same line as the plot call it is a syntax error.
ed_overall <- od2$ed_overall
# Attach the score to the original (unscaled) data so that the top rows show
# the raw column values next to their score.
od3 <- cbind(od, ed_overall)
# Sort rows from the highest score to the lowest.
od3 <- od3[order(od3$ed_overall, decreasing = TRUE), ]
# Show the first rows of the sorted data, i.e. the highest scores.
head(od3)
## Banker Accounts Maximum.Spike Inactivity Early.Closure Charge.Off
## 14781 14781 30.98339 1.279827 0.33 0.18 0.14
## 5634 5634 53.72716 4.046279 0.15 0.29 0.43
## 9 9 102.35200 9.745000 0.27 0.28 0.01
## 11833 11833 69.84385 2.672617 0.73 0.44 0.15
## 11476 11476 121.52178 5.421197 0.50 0.48 0.16
## 102 102 108.83688 5.404849 0.47 0.53 0.16
## Fraud ed_overall
## 14781 0.49 45.54294
## 5634 0.10 32.79255
## 9 0.09 22.54170
## 11833 0.09 12.86515
## 11476 0.16 12.11792
## 102 0.18 11.92362