Credit Card Fraud

This document records some experiments with the credit card fraud dataset.

# Load the credit card transactions from a comma-separated file whose first
# row holds the column names, then print per-column summary statistics.
creditcard <- read.table(
  '/home/steve/RstudioSandbox/data/creditcard.csv',
  header = TRUE,
  sep = ','
)
summary(creditcard)
##       Time              V1                  V2           
##  Min.   :     0   Min.   :-56.40751   Min.   :-72.71573  
##  1st Qu.: 54202   1st Qu.: -0.92037   1st Qu.: -0.59855  
##  Median : 84692   Median :  0.01811   Median :  0.06549  
##  Mean   : 94814   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.:139320   3rd Qu.:  1.31564   3rd Qu.:  0.80372  
##  Max.   :172792   Max.   :  2.45493   Max.   : 22.05773  
##        V3                 V4                 V5            
##  Min.   :-48.3256   Min.   :-5.68317   Min.   :-113.74331  
##  1st Qu.: -0.8904   1st Qu.:-0.84864   1st Qu.:  -0.69160  
##  Median :  0.1799   Median :-0.01985   Median :  -0.05434  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :   0.00000  
##  3rd Qu.:  1.0272   3rd Qu.: 0.74334   3rd Qu.:   0.61193  
##  Max.   :  9.3826   Max.   :16.87534   Max.   :  34.80167  
##        V6                 V7                 V8           
##  Min.   :-26.1605   Min.   :-43.5572   Min.   :-73.21672  
##  1st Qu.: -0.7683   1st Qu.: -0.5541   1st Qu.: -0.20863  
##  Median : -0.2742   Median :  0.0401   Median :  0.02236  
##  Mean   :  0.0000   Mean   :  0.0000   Mean   :  0.00000  
##  3rd Qu.:  0.3986   3rd Qu.:  0.5704   3rd Qu.:  0.32735  
##  Max.   : 73.3016   Max.   :120.5895   Max.   : 20.00721  
##        V9                 V10                 V11          
##  Min.   :-13.43407   Min.   :-24.58826   Min.   :-4.79747  
##  1st Qu.: -0.64310   1st Qu.: -0.53543   1st Qu.:-0.76249  
##  Median : -0.05143   Median : -0.09292   Median :-0.03276  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.59714   3rd Qu.:  0.45392   3rd Qu.: 0.73959  
##  Max.   : 15.59500   Max.   : 23.74514   Max.   :12.01891  
##       V12                V13                V14          
##  Min.   :-18.6837   Min.   :-5.79188   Min.   :-19.2143  
##  1st Qu.: -0.4056   1st Qu.:-0.64854   1st Qu.: -0.4256  
##  Median :  0.1400   Median :-0.01357   Median :  0.0506  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :  0.0000  
##  3rd Qu.:  0.6182   3rd Qu.: 0.66251   3rd Qu.:  0.4931  
##  Max.   :  7.8484   Max.   : 7.12688   Max.   : 10.5268  
##       V15                V16                 V17           
##  Min.   :-4.49894   Min.   :-14.12985   Min.   :-25.16280  
##  1st Qu.:-0.58288   1st Qu.: -0.46804   1st Qu.: -0.48375  
##  Median : 0.04807   Median :  0.06641   Median : -0.06568  
##  Mean   : 0.00000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.64882   3rd Qu.:  0.52330   3rd Qu.:  0.39968  
##  Max.   : 8.87774   Max.   : 17.31511   Max.   :  9.25353  
##       V18                 V19                 V20           
##  Min.   :-9.498746   Min.   :-7.213527   Min.   :-54.49772  
##  1st Qu.:-0.498850   1st Qu.:-0.456299   1st Qu.: -0.21172  
##  Median :-0.003636   Median : 0.003735   Median : -0.06248  
##  Mean   : 0.000000   Mean   : 0.000000   Mean   :  0.00000  
##  3rd Qu.: 0.500807   3rd Qu.: 0.458949   3rd Qu.:  0.13304  
##  Max.   : 5.041069   Max.   : 5.591971   Max.   : 39.42090  
##       V21                 V22                  V23           
##  Min.   :-34.83038   Min.   :-10.933144   Min.   :-44.80774  
##  1st Qu.: -0.22839   1st Qu.: -0.542350   1st Qu.: -0.16185  
##  Median : -0.02945   Median :  0.006782   Median : -0.01119  
##  Mean   :  0.00000   Mean   :  0.000000   Mean   :  0.00000  
##  3rd Qu.:  0.18638   3rd Qu.:  0.528554   3rd Qu.:  0.14764  
##  Max.   : 27.20284   Max.   : 10.503090   Max.   : 22.52841  
##       V24                V25                 V26          
##  Min.   :-2.83663   Min.   :-10.29540   Min.   :-2.60455  
##  1st Qu.:-0.35459   1st Qu.: -0.31715   1st Qu.:-0.32698  
##  Median : 0.04098   Median :  0.01659   Median :-0.05214  
##  Mean   : 0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.43953   3rd Qu.:  0.35072   3rd Qu.: 0.24095  
##  Max.   : 4.58455   Max.   :  7.51959   Max.   : 3.51735  
##       V27                  V28                Amount        
##  Min.   :-22.565679   Min.   :-15.43008   Min.   :    0.00  
##  1st Qu.: -0.070840   1st Qu.: -0.05296   1st Qu.:    5.60  
##  Median :  0.001342   Median :  0.01124   Median :   22.00  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   :   88.35  
##  3rd Qu.:  0.091045   3rd Qu.:  0.07828   3rd Qu.:   77.17  
##  Max.   : 31.612198   Max.   : 33.84781   Max.   :25691.16  
##      Class         
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.001728  
##  3rd Qu.:0.000000  
##  Max.   :1.000000

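The chunk below plots density ridges of the percent-ranked values for each variable, with the fill colour distinguishing the two values of Class. (The R Markdown template's remark about the `echo = FALSE` parameter does not apply here: the R code that generates the plot is shown.)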

library(tidyr)
library(ggridges)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Stack every column except Class into (variable, value) pairs, then draw one
# density ridge per variable over the percent rank of the values, filled by
# Class.
creditcard %>%
  gather(key = variable, value = value, -Class) %>%
  ggplot(
    aes(
      x = percent_rank(value),
      y = as.factor(variable),
      fill = as.factor(Class)
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.0309

library(dplyr)
library(purrr)
# Per-column statistics (mean, min, max) for every column except Time and
# Class, stored as a named list with one entry per column.
norm <- creditcard %>%
  select(-Time, -Class) %>%
  map(function(column) {
    list(mean = mean(column), min = min(column), max = max(column))
  })

# Rescale each of those columns as (x - min) / (max - min) using its own entry
# in `norm`, and convert the result to a matrix.
train <- creditcard %>%
  select(-Time, -Class) %>%
  map2_df(norm, function(column, stats) {
    (column - stats$min) / (stats$max - stats$min)
  }) %>%
  as.matrix()
library(keras)

# Number of feature columns in the training matrix.
ncolumns <- ncol(train)
print(ncolumns)
## [1] 29
# Sequential network of four dense layers, added in order:
#   1. 29 units, taking ncol(train) input features (no activation given)
#   2. 10 units, tanh
#   3. 15 units, tanh
#   4. ncol(train) units (no activation given), so the output is as wide as
#      the input
model <- keras_model_sequential()
model <- layer_dense(model, units = 29, input_shape = ncol(train))
model <- layer_dense(model, units = 10, activation = "tanh")
model <- layer_dense(model, units = 15, activation = "tanh")
model <- layer_dense(model, units = ncol(train))

summary(model)
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 29)                    870         
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 10)                    300         
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 15)                    165         
## ___________________________________________________________________________
## dense_4 (Dense)                  (None, 29)                    464         
## ===========================================================================
## Total params: 1,799
## Trainable params: 1,799
## Non-trainable params: 0
## ___________________________________________________________________________
# Train the network to reproduce its own input (x and y are the same matrix)
# using only the rows where Class == 0, then score every row by its
# reconstruction error and evaluate that score against the true labels.
model %>% keras::compile(optimizer = "adam", loss = "mean_squared_error")
y_train <- creditcard$Class

# Rows labelled Class == 0. Computed once instead of four times;
# drop = FALSE keeps a matrix even if only a single row matches.
normal_rows <- train[y_train == 0, , drop = FALSE]

# Checkpoint callback: the file name embeds the epoch number and val_loss, and
# save_best_only = TRUE limits writes to epochs where the monitored quantity
# improves.
model_save <- callback_model_checkpoint(
  filepath = 'weights.{epoch:02}-{val_loss:.2}.hdf5',
  save_best_only = TRUE,
  verbose = 2
)

# NOTE(review): validation_data is the same matrix as the training data, so
# val_loss (which appears in the checkpoint file name) is not measured on
# held-out rows. Consider holding out a separate validation split.
model %>%
  fit(
    x = normal_rows,
    y = normal_rows,
    epochs = 50,
    batch_size = 32,
    validation_data = list(normal_rows, normal_rows),
    callbacks = list(model_save, callback_early_stopping(patience = 5))
  )

# Reconstruct every row (both classes) and compute the per-row sum of squared
# differences between input and reconstruction. Despite its name, mse_train is
# a sum over columns, not a mean.
p <- predict(model, train)
mse_train <- rowSums((train - p)^2)
library(Metrics)
# auc() takes the true labels first and the scores second. The original call
# passed the feature matrix `train` as the labels, which returned NA.
auc(y_train, mse_train)
## [1] NA