This document records some experiments with credit card fraud detection.
# Load the credit card transactions file (comma-separated, first row holds the
# column names) and print per-column summary statistics.
creditcard <- read.table(
  file = "/home/steve/RstudioSandbox/data/creditcard.csv",
  header = TRUE,
  sep = ","
)
summary(creditcard)
## Time V1 V2
## Min. : 0 Min. :-56.40751 Min. :-72.71573
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855
## Median : 84692 Median : 0.01811 Median : 0.06549
## Mean : 94814 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:139320 3rd Qu.: 1.31564 3rd Qu.: 0.80372
## Max. :172792 Max. : 2.45493 Max. : 22.05773
## V3 V4 V5
## Min. :-48.3256 Min. :-5.68317 Min. :-113.74331
## 1st Qu.: -0.8904 1st Qu.:-0.84864 1st Qu.: -0.69160
## Median : 0.1799 Median :-0.01985 Median : -0.05434
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 1.0272 3rd Qu.: 0.74334 3rd Qu.: 0.61193
## Max. : 9.3826 Max. :16.87534 Max. : 34.80167
## V6 V7 V8
## Min. :-26.1605 Min. :-43.5572 Min. :-73.21672
## 1st Qu.: -0.7683 1st Qu.: -0.5541 1st Qu.: -0.20863
## Median : -0.2742 Median : 0.0401 Median : 0.02236
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3986 3rd Qu.: 0.5704 3rd Qu.: 0.32735
## Max. : 73.3016 Max. :120.5895 Max. : 20.00721
## V9 V10 V11
## Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256
## Median : 0.1400 Median :-0.01357 Median : 0.0506
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268
## V15 V16 V17
## Min. :-4.49894 Min. :-14.12985 Min. :-25.16280
## 1st Qu.:-0.58288 1st Qu.: -0.46804 1st Qu.: -0.48375
## Median : 0.04807 Median : 0.06641 Median : -0.06568
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.64882 3rd Qu.: 0.52330 3rd Qu.: 0.39968
## Max. : 8.87774 Max. : 17.31511 Max. : 9.25353
## V18 V19 V20
## Min. :-9.498746 Min. :-7.213527 Min. :-54.49772
## 1st Qu.:-0.498850 1st Qu.:-0.456299 1st Qu.: -0.21172
## Median :-0.003636 Median : 0.003735 Median : -0.06248
## Mean : 0.000000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.500807 3rd Qu.: 0.458949 3rd Qu.: 0.13304
## Max. : 5.041069 Max. : 5.591971 Max. : 39.42090
## V21 V22 V23
## Min. :-34.83038 Min. :-10.933144 Min. :-44.80774
## 1st Qu.: -0.22839 1st Qu.: -0.542350 1st Qu.: -0.16185
## Median : -0.02945 Median : 0.006782 Median : -0.01119
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.18638 3rd Qu.: 0.528554 3rd Qu.: 0.14764
## Max. : 27.20284 Max. : 10.503090 Max. : 22.52841
## V24 V25 V26
## Min. :-2.83663 Min. :-10.29540 Min. :-2.60455
## 1st Qu.:-0.35459 1st Qu.: -0.31715 1st Qu.:-0.32698
## Median : 0.04098 Median : 0.01659 Median :-0.05214
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.43953 3rd Qu.: 0.35072 3rd Qu.: 0.24095
## Max. : 4.58455 Max. : 7.51959 Max. : 3.51735
## V27 V28 Amount
## Min. :-22.565679 Min. :-15.43008 Min. : 0.00
## 1st Qu.: -0.070840 1st Qu.: -0.05296 1st Qu.: 5.60
## Median : 0.001342 Median : 0.01124 Median : 22.00
## Mean : 0.000000 Mean : 0.00000 Mean : 88.35
## 3rd Qu.: 0.091045 3rd Qu.: 0.07828 3rd Qu.: 77.17
## Max. : 31.612198 Max. : 33.84781 Max. :25691.16
## Class
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.001728
## 3rd Qu.:0.000000
## Max. :1.000000
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
library(tidyr)
library(ggridges)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Ridge plot: reshape every column except Class into (variable, value) pairs,
# then draw one density ridge per variable with the fill split by Class.
#
# ggplot() and aes() are namespace-qualified because ggplot2 itself is never
# attached in this document; relying on ggridges to attach it is fragile
# (ggridges may only import ggplot2), in which case the bare calls fail with
# "could not find function".
#
# NOTE(review): percent_rank() is applied to the pooled `value` column, so the
# ranks are computed across all variables together rather than within each
# variable -- confirm this is what is intended.
creditcard %>%
  gather(variable, value, -Class) %>%
  ggplot2::ggplot(
    ggplot2::aes(
      y = as.factor(variable),
      fill = as.factor(Class),
      x = percent_rank(value)
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.0309
library(dplyr)
library(purrr)
# Per-column statistics (mean, min, max) for every column except Time and
# Class, stored as a named list with one entry per column. Only min and max
# are used by the rescaling below.
norm <- creditcard %>%
  select(-Time, -Class) %>%
  map(function(column) {
    list(mean = mean(column), min = min(column), max = max(column))
  })

# Min-max rescale each of those columns with its own statistics,
# (x - min) / (max - min), and convert the result to a numeric matrix.
train <- creditcard %>%
  select(-Time, -Class) %>%
  map2_df(norm, function(column, stats) {
    (column - stats$min) / (stats$max - stats$min)
  }) %>%
  as.matrix()
library(keras)
# Number of feature columns in the training matrix.
ncolumns <- ncol(train)
print(ncolumns)
## [1] 29
# Sequential dense network whose output width equals its input width
# (ncol(train)), with hidden layers of 29, 10 and 15 units. The first and last
# layers are given no activation argument; the two middle layers use tanh.
# NOTE(review): the first layer's 29 units is hard-coded while the input and
# output widths come from ncol(train) -- confirm whether it should track it.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 29,
    input_shape = ncol(train)
  ) %>%
  layer_dense(
    units = 10,
    activation = "tanh"
  ) %>%
  layer_dense(
    units = 15,
    activation = "tanh"
  ) %>%
  layer_dense(
    units = ncol(train)
  )

# Print the layer-by-layer structure and parameter counts.
summary(model)
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense_1 (Dense) (None, 29) 870
## ___________________________________________________________________________
## dense_2 (Dense) (None, 10) 300
## ___________________________________________________________________________
## dense_3 (Dense) (None, 15) 165
## ___________________________________________________________________________
## dense_4 (Dense) (None, 29) 464
## ===========================================================================
## Total params: 1,799
## Trainable params: 1,799
## Non-trainable params: 0
## ___________________________________________________________________________
# Configure training: Adam optimizer, mean squared error loss.
model %>%
  keras::compile(
    optimizer = "adam",
    loss = "mean_squared_error"
  )

# Class labels, used below to select the rows with Class == 0.
y_train <- creditcard$Class

# Checkpoint callback: writes weights to a file named after the epoch and
# val_loss, keeping only improvements (save_best_only = TRUE).
model_save <- callback_model_checkpoint(
  filepath = "weights.{epoch:02}-{val_loss:.2}.hdf5",
  save_best_only = TRUE,
  verbose = 2
)

# Fit the network to reproduce its own input, using only the Class == 0 rows
# as both input and target, with checkpointing and early stopping.
# NOTE(review): validation_data is the very same rows as the training data, so
# the validation loss is not measured on held-out data -- confirm intended.
model %>%
  fit(
    x = train[y_train == 0, ],
    y = train[y_train == 0, ],
    epochs = 50,
    batch_size = 32,
    validation_data = list(
      train[y_train == 0, ],
      train[y_train == 0, ]
    ),
    callbacks = list(
      model_save,
      callback_early_stopping(patience = 5)
    )
  )
# Reconstruct every row of the training matrix with the fitted model and score
# each row by its squared reconstruction error summed over all columns.
p <- predict(model, train)
mse_train <- rowSums((train - p)^2)

library(Metrics)
# auc() expects (actual, predicted): the 0/1 class labels first, then the
# score. The earlier call passed the feature matrix `train` as the labels,
# which is why it printed NA; the labels live in y_train.
auc(y_train, mse_train)
# Output not regenerated here; the previous call auc(train, mse_train)
# printed "[1] NA".