# Read fraudTrain.csv into the data frame df_train and preview its first
# six rows.
df_train <- read.csv(
  file = "/Users/suyogkan/Desktop/Cap/Credit-Card-Fraud-Detection-Capstone-Project/fraudTrain.csv"
)
head(x = df_train, n = 6L)
##   X trans_date_trans_time       cc_num                           merchant
## 1 0   2019-01-01 00:00:18 2.703186e+15         fraud_Rippin, Kub and Mann
## 2 1   2019-01-01 00:00:44 6.304233e+11    fraud_Heller, Gutmann and Zieme
## 3 2   2019-01-01 00:00:51 3.885949e+13               fraud_Lind-Buckridge
## 4 3   2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4   2019-01-01 00:03:06 3.755342e+14                fraud_Keeling-Crist
## 6 5   2019-01-01 00:04:08 4.767265e+15   fraud_Stroman, Hudson and Erdman
##        category    amt     first    last gender                       street
## 1      misc_net   4.97  Jennifer   Banks      F               561 Perry Cove
## 2   grocery_pos 107.23 Stephanie    Gill      F 43039 Riley Greens Suite 393
## 3 entertainment 220.11    Edward Sanchez      M     594 White Dale Suite 530
## 4 gas_transport  45.00    Jeremy   White      M  9443 Cynthia Court Apt. 038
## 5      misc_pos  41.96     Tyler  Garcia      M             408 Bradley Rest
## 6 gas_transport  94.63  Jennifer  Conner      F            4655 David Island
##             city state   zip     lat      long city_pop
## 1 Moravian Falls    NC 28654 36.0788  -81.1781     3495
## 2         Orient    WA 99160 48.8878 -118.2105      149
## 3     Malad City    ID 83252 42.1808 -112.2620     4154
## 4        Boulder    MT 59632 46.2306 -112.1138     1939
## 5       Doe Hill    VA 24433 38.4207  -79.4629       99
## 6         Dublin    PA 18917 40.3750  -75.2045     2158
##                                 job        dob                        trans_num
## 1         Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3       Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4                   Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5    Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6                 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
##    unix_time merch_lat merch_long is_fraud
## 1 1325376018  36.01129  -82.04832        0
## 2 1325376044  49.15905 -118.18646        0
## 3 1325376051  43.15070 -112.15448        0
## 4 1325376076  47.03433 -112.56107        0
## 5 1325376186  38.67500  -78.63246        0
## 6 1325376248  40.65338  -76.15267        0
# Compact overview of df_train: dimensions, column types and leading values.
str(object = df_train)
## 'data.frame':    1296675 obs. of  23 variables:
##  $ X                    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ trans_date_trans_time: chr  "2019-01-01 00:00:18" "2019-01-01 00:00:44" "2019-01-01 00:00:51" "2019-01-01 00:01:16" ...
##  $ cc_num               : num  2.70e+15 6.30e+11 3.89e+13 3.53e+15 3.76e+14 ...
##  $ merchant             : chr  "fraud_Rippin, Kub and Mann" "fraud_Heller, Gutmann and Zieme" "fraud_Lind-Buckridge" "fraud_Kutch, Hermiston and Farrell" ...
##  $ category             : chr  "misc_net" "grocery_pos" "entertainment" "gas_transport" ...
##  $ amt                  : num  4.97 107.23 220.11 45 41.96 ...
##  $ first                : chr  "Jennifer" "Stephanie" "Edward" "Jeremy" ...
##  $ last                 : chr  "Banks" "Gill" "Sanchez" "White" ...
##  $ gender               : chr  "F" "F" "M" "M" ...
##  $ street               : chr  "561 Perry Cove" "43039 Riley Greens Suite 393" "594 White Dale Suite 530" "9443 Cynthia Court Apt. 038" ...
##  $ city                 : chr  "Moravian Falls" "Orient" "Malad City" "Boulder" ...
##  $ state                : chr  "NC" "WA" "ID" "MT" ...
##  $ zip                  : int  28654 99160 83252 59632 24433 18917 67851 22824 15665 37040 ...
##  $ lat                  : num  36.1 48.9 42.2 46.2 38.4 ...
##  $ long                 : num  -81.2 -118.2 -112.3 -112.1 -79.5 ...
##  $ city_pop             : int  3495 149 4154 1939 99 2158 2691 6018 1472 151785 ...
##  $ job                  : chr  "Psychologist, counselling" "Special educational needs teacher" "Nature conservation officer" "Patent attorney" ...
##  $ dob                  : chr  "1988-03-09" "1978-06-21" "1962-01-19" "1967-01-12" ...
##  $ trans_num            : chr  "0b242abb623afc578575680df30655b9" "1f76529f8574734946361c461b024d99" "a1a22d70485983eac12b5b88dad1cf95" "6b849c168bdad6f867558c3793159a81" ...
##  $ unix_time            : int  1325376018 1325376044 1325376051 1325376076 1325376186 1325376248 1325376282 1325376308 1325376318 1325376361 ...
##  $ merch_lat            : num  36 49.2 43.2 47 38.7 ...
##  $ merch_long           : num  -82 -118.2 -112.2 -112.6 -78.6 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary of df_train (quantiles for numeric columns, length and
# class for character columns).
summary(object = df_train)
##        X           trans_date_trans_time     cc_num            merchant        
##  Min.   :      0   Length:1296675        Min.   :6.042e+10   Length:1296675    
##  1st Qu.: 324168   Class :character      1st Qu.:1.800e+14   Class :character  
##  Median : 648337   Mode  :character      Median :3.521e+15   Mode  :character  
##  Mean   : 648337                         Mean   :4.172e+17                     
##  3rd Qu.: 972506                         3rd Qu.:4.642e+15                     
##  Max.   :1296674                         Max.   :4.992e+18                     
##    category              amt              first               last          
##  Length:1296675     Min.   :    1.00   Length:1296675     Length:1296675    
##  Class :character   1st Qu.:    9.65   Class :character   Class :character  
##  Mode  :character   Median :   47.52   Mode  :character   Mode  :character  
##                     Mean   :   70.35                                        
##                     3rd Qu.:   83.14                                        
##                     Max.   :28948.90                                        
##     gender             street              city              state          
##  Length:1296675     Length:1296675     Length:1296675     Length:1296675    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       zip             lat             long            city_pop      
##  Min.   : 1257   Min.   :20.03   Min.   :-165.67   Min.   :     23  
##  1st Qu.:26237   1st Qu.:34.62   1st Qu.: -96.80   1st Qu.:    743  
##  Median :48174   Median :39.35   Median : -87.48   Median :   2456  
##  Mean   :48801   Mean   :38.54   Mean   : -90.23   Mean   :  88824  
##  3rd Qu.:72042   3rd Qu.:41.94   3rd Qu.: -80.16   3rd Qu.:  20328  
##  Max.   :99783   Max.   :66.69   Max.   : -67.95   Max.   :2906700  
##      job                dob             trans_num           unix_time        
##  Length:1296675     Length:1296675     Length:1296675     Min.   :1.325e+09  
##  Class :character   Class :character   Class :character   1st Qu.:1.339e+09  
##  Mode  :character   Mode  :character   Mode  :character   Median :1.349e+09  
##                                                           Mean   :1.349e+09  
##                                                           3rd Qu.:1.359e+09  
##                                                           Max.   :1.372e+09  
##    merch_lat       merch_long         is_fraud       
##  Min.   :19.03   Min.   :-166.67   Min.   :0.000000  
##  1st Qu.:34.73   1st Qu.: -96.90   1st Qu.:0.000000  
##  Median :39.37   Median : -87.44   Median :0.000000  
##  Mean   :38.54   Mean   : -90.23   Mean   :0.005789  
##  3rd Qu.:41.96   3rd Qu.: -80.24   3rd Qu.:0.000000  
##  Max.   :67.51   Max.   : -66.95   Max.   :1.000000
# Read fraudTest.csv into the data frame df_test and preview its first
# six rows.
df_test <- read.csv(
  file = "/Users/suyogkan/Desktop/Cap/Credit-Card-Fraud-Detection-Capstone-Project/fraudTest.csv"
)
head(x = df_test, n = 6L)
##   X trans_date_trans_time       cc_num                             merchant
## 1 0   2020-06-21 12:14:25 2.291164e+15                fraud_Kirlin and Sons
## 2 1   2020-06-21 12:14:33 3.573030e+15                 fraud_Sporer-Keebler
## 3 2   2020-06-21 12:14:53 3.598215e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3   2020-06-21 12:15:15 3.591920e+15                    fraud_Haley Group
## 5 4   2020-06-21 12:15:17 3.526826e+15                fraud_Johnston-Casper
## 6 5   2020-06-21 12:15:37 3.040768e+13                  fraud_Daugherty LLC
##         category   amt    first     last gender                      street
## 1  personal_care  2.86     Jeff  Elliott      M           351 Darlene Green
## 2  personal_care 29.84   Joanne Williams      F            3638 Marsh Union
## 3 health_fitness 41.28   Ashley    Lopez      F        9333 Valentine Point
## 4       misc_pos 60.05    Brian Williams      M 32941 Krystal Mill Apt. 552
## 5         travel  3.19   Nathan   Massey      M    5783 Evan Roads Apt. 465
## 6      kids_pets 19.55 Danielle    Evans      F  76752 David Lodge Apt. 064
##         city state   zip     lat      long city_pop                    job
## 1   Columbia    SC 29209 33.9659  -80.9355   333497    Mechanical engineer
## 2    Altonah    UT 84002 40.3207 -110.4360      302 Sales professional, IT
## 3   Bellmore    NY 11710 40.6729  -73.5365    34496      Librarian, public
## 4 Titusville    FL 32780 28.5697  -80.8191    54767           Set designer
## 5   Falmouth    MI 49632 44.2529  -85.0170     1126     Furniture designer
## 6  Breesport    NY 14816 42.1939  -76.7361      520        Psychotherapist
##          dob                        trans_num  unix_time merch_lat merch_long
## 1 1968-03-19 2da90c7d74bd46a0caf3777415b3ebd3 1371816865  33.98639  -81.20071
## 2 1990-01-17 324cc204407e99f51b0d6ca0055005e7 1371816873  39.45050 -109.96043
## 3 1970-10-21 c81755dbbbea9d5c77f094348a7579be 1371816893  40.49581  -74.19611
## 4 1987-07-25 2159175b9efe66dc301f149d3d5abf8c 1371816915  28.81240  -80.88306
## 5 1955-07-06 57ff021bd3f328f8738bb535c302a31b 1371816917  44.95915  -85.88473
## 6 1991-10-13 798db04aaceb4febd084f1a7c404da93 1371816937  41.74716  -77.58420
##   is_fraud
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
# Compact overview of df_test: dimensions, column types and leading values.
str(object = df_test)
## 'data.frame':    555719 obs. of  23 variables:
##  $ X                    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ trans_date_trans_time: chr  "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
##  $ cc_num               : num  2.29e+15 3.57e+15 3.60e+15 3.59e+15 3.53e+15 ...
##  $ merchant             : chr  "fraud_Kirlin and Sons" "fraud_Sporer-Keebler" "fraud_Swaniawski, Nitzsche and Welch" "fraud_Haley Group" ...
##  $ category             : chr  "personal_care" "personal_care" "health_fitness" "misc_pos" ...
##  $ amt                  : num  2.86 29.84 41.28 60.05 3.19 ...
##  $ first                : chr  "Jeff" "Joanne" "Ashley" "Brian" ...
##  $ last                 : chr  "Elliott" "Williams" "Lopez" "Williams" ...
##  $ gender               : chr  "M" "F" "F" "M" ...
##  $ street               : chr  "351 Darlene Green" "3638 Marsh Union" "9333 Valentine Point" "32941 Krystal Mill Apt. 552" ...
##  $ city                 : chr  "Columbia" "Altonah" "Bellmore" "Titusville" ...
##  $ state                : chr  "SC" "UT" "NY" "FL" ...
##  $ zip                  : int  29209 84002 11710 32780 49632 14816 95528 57374 16858 76678 ...
##  $ lat                  : num  34 40.3 40.7 28.6 44.3 ...
##  $ long                 : num  -80.9 -110.4 -73.5 -80.8 -85 ...
##  $ city_pop             : int  333497 302 34496 54767 1126 520 1139 343 3688 263 ...
##  $ job                  : chr  "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
##  $ dob                  : chr  "1968-03-19" "1990-01-17" "1970-10-21" "1987-07-25" ...
##  $ trans_num            : chr  "2da90c7d74bd46a0caf3777415b3ebd3" "324cc204407e99f51b0d6ca0055005e7" "c81755dbbbea9d5c77f094348a7579be" "2159175b9efe66dc301f149d3d5abf8c" ...
##  $ unix_time            : int  1371816865 1371816873 1371816893 1371816915 1371816917 1371816937 1371816944 1371816950 1371816970 1371816971 ...
##  $ merch_lat            : num  34 39.5 40.5 28.8 45 ...
##  $ merch_long           : num  -81.2 -110 -74.2 -80.9 -85.9 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary of df_test (quantiles for numeric columns, length and
# class for character columns).
summary(object = df_test)
##        X          trans_date_trans_time     cc_num            merchant        
##  Min.   :     0   Length:555719         Min.   :6.042e+10   Length:555719     
##  1st Qu.:138930   Class :character      1st Qu.:1.800e+14   Class :character  
##  Median :277859   Mode  :character      Median :3.521e+15   Mode  :character  
##  Mean   :277859                         Mean   :4.178e+17                     
##  3rd Qu.:416788                         3rd Qu.:4.635e+15                     
##  Max.   :555718                         Max.   :4.992e+18                     
##    category              amt              first               last          
##  Length:555719      Min.   :    1.00   Length:555719      Length:555719     
##  Class :character   1st Qu.:    9.63   Class :character   Class :character  
##  Mode  :character   Median :   47.29   Mode  :character   Mode  :character  
##                     Mean   :   69.39                                        
##                     3rd Qu.:   83.01                                        
##                     Max.   :22768.11                                        
##     gender             street              city              state          
##  Length:555719      Length:555719      Length:555719      Length:555719     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       zip             lat             long            city_pop      
##  Min.   : 1257   Min.   :20.03   Min.   :-165.67   Min.   :     23  
##  1st Qu.:26292   1st Qu.:34.67   1st Qu.: -96.80   1st Qu.:    741  
##  Median :48174   Median :39.37   Median : -87.48   Median :   2408  
##  Mean   :48843   Mean   :38.54   Mean   : -90.23   Mean   :  88222  
##  3rd Qu.:72011   3rd Qu.:41.89   3rd Qu.: -80.18   3rd Qu.:  19685  
##  Max.   :99921   Max.   :65.69   Max.   : -67.95   Max.   :2906700  
##      job                dob             trans_num           unix_time        
##  Length:555719      Length:555719      Length:555719      Min.   :1.372e+09  
##  Class :character   Class :character   Class :character   1st Qu.:1.376e+09  
##  Mode  :character   Mode  :character   Mode  :character   Median :1.381e+09  
##                                                           Mean   :1.381e+09  
##                                                           3rd Qu.:1.386e+09  
##                                                           Max.   :1.389e+09  
##    merch_lat       merch_long         is_fraud      
##  Min.   :19.03   Min.   :-166.67   Min.   :0.00000  
##  1st Qu.:34.76   1st Qu.: -96.91   1st Qu.:0.00000  
##  Median :39.38   Median : -87.45   Median :0.00000  
##  Mean   :38.54   Mean   : -90.23   Mean   :0.00386  
##  3rd Qu.:41.95   3rd Qu.: -80.26   3rd Qu.:0.00000  
##  Max.   :66.68   Max.   : -66.95   Max.   :1.00000
# Derive the transaction date (the time-of-day part of trans_date_trans_time
# is ignored by the "%Y-%m-%d" format) and convert dob from text to POSIXct.
# tz = "UTC" is set explicitly: without it the strings are interpreted at
# midnight in the machine's local time zone, so results differ between
# machines and daylight-saving shifts can make later day differences
# fractional.
df_train$trans_date <- as.POSIXct(
  df_train$trans_date_trans_time,
  format = "%Y-%m-%d",
  tz = "UTC"
)
df_train$dob <- as.POSIXct(df_train$dob, format = "%Y-%m-%d", tz = "UTC")
head(df_train)
##   X trans_date_trans_time       cc_num                           merchant
## 1 0   2019-01-01 00:00:18 2.703186e+15         fraud_Rippin, Kub and Mann
## 2 1   2019-01-01 00:00:44 6.304233e+11    fraud_Heller, Gutmann and Zieme
## 3 2   2019-01-01 00:00:51 3.885949e+13               fraud_Lind-Buckridge
## 4 3   2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4   2019-01-01 00:03:06 3.755342e+14                fraud_Keeling-Crist
## 6 5   2019-01-01 00:04:08 4.767265e+15   fraud_Stroman, Hudson and Erdman
##        category    amt     first    last gender                       street
## 1      misc_net   4.97  Jennifer   Banks      F               561 Perry Cove
## 2   grocery_pos 107.23 Stephanie    Gill      F 43039 Riley Greens Suite 393
## 3 entertainment 220.11    Edward Sanchez      M     594 White Dale Suite 530
## 4 gas_transport  45.00    Jeremy   White      M  9443 Cynthia Court Apt. 038
## 5      misc_pos  41.96     Tyler  Garcia      M             408 Bradley Rest
## 6 gas_transport  94.63  Jennifer  Conner      F            4655 David Island
##             city state   zip     lat      long city_pop
## 1 Moravian Falls    NC 28654 36.0788  -81.1781     3495
## 2         Orient    WA 99160 48.8878 -118.2105      149
## 3     Malad City    ID 83252 42.1808 -112.2620     4154
## 4        Boulder    MT 59632 46.2306 -112.1138     1939
## 5       Doe Hill    VA 24433 38.4207  -79.4629       99
## 6         Dublin    PA 18917 40.3750  -75.2045     2158
##                                 job        dob                        trans_num
## 1         Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3       Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4                   Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5    Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6                 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
##    unix_time merch_lat merch_long is_fraud trans_date
## 1 1325376018  36.01129  -82.04832        0 2019-01-01
## 2 1325376044  49.15905 -118.18646        0 2019-01-01
## 3 1325376051  43.15070 -112.15448        0 2019-01-01
## 4 1325376076  47.03433 -112.56107        0 2019-01-01
## 5 1325376186  38.67500  -78.63246        0 2019-01-01
## 6 1325376248  40.65338  -76.15267        0 2019-01-01
# Derive the transaction date (the time-of-day part of trans_date_trans_time
# is ignored by the "%Y-%m-%d" format) and convert dob from text to POSIXct.
# tz = "UTC" is set explicitly: without it the strings are interpreted at
# midnight in the machine's local time zone, so results differ between
# machines and daylight-saving shifts can make later day differences
# fractional.
df_test$trans_date <- as.POSIXct(
  df_test$trans_date_trans_time,
  format = "%Y-%m-%d",
  tz = "UTC"
)
df_test$dob <- as.POSIXct(df_test$dob, format = "%Y-%m-%d", tz = "UTC")
head(df_test)
##   X trans_date_trans_time       cc_num                             merchant
## 1 0   2020-06-21 12:14:25 2.291164e+15                fraud_Kirlin and Sons
## 2 1   2020-06-21 12:14:33 3.573030e+15                 fraud_Sporer-Keebler
## 3 2   2020-06-21 12:14:53 3.598215e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3   2020-06-21 12:15:15 3.591920e+15                    fraud_Haley Group
## 5 4   2020-06-21 12:15:17 3.526826e+15                fraud_Johnston-Casper
## 6 5   2020-06-21 12:15:37 3.040768e+13                  fraud_Daugherty LLC
##         category   amt    first     last gender                      street
## 1  personal_care  2.86     Jeff  Elliott      M           351 Darlene Green
## 2  personal_care 29.84   Joanne Williams      F            3638 Marsh Union
## 3 health_fitness 41.28   Ashley    Lopez      F        9333 Valentine Point
## 4       misc_pos 60.05    Brian Williams      M 32941 Krystal Mill Apt. 552
## 5         travel  3.19   Nathan   Massey      M    5783 Evan Roads Apt. 465
## 6      kids_pets 19.55 Danielle    Evans      F  76752 David Lodge Apt. 064
##         city state   zip     lat      long city_pop                    job
## 1   Columbia    SC 29209 33.9659  -80.9355   333497    Mechanical engineer
## 2    Altonah    UT 84002 40.3207 -110.4360      302 Sales professional, IT
## 3   Bellmore    NY 11710 40.6729  -73.5365    34496      Librarian, public
## 4 Titusville    FL 32780 28.5697  -80.8191    54767           Set designer
## 5   Falmouth    MI 49632 44.2529  -85.0170     1126     Furniture designer
## 6  Breesport    NY 14816 42.1939  -76.7361      520        Psychotherapist
##          dob                        trans_num  unix_time merch_lat merch_long
## 1 1968-03-19 2da90c7d74bd46a0caf3777415b3ebd3 1371816865  33.98639  -81.20071
## 2 1990-01-17 324cc204407e99f51b0d6ca0055005e7 1371816873  39.45050 -109.96043
## 3 1970-10-21 c81755dbbbea9d5c77f094348a7579be 1371816893  40.49581  -74.19611
## 4 1987-07-25 2159175b9efe66dc301f149d3d5abf8c 1371816915  28.81240  -80.88306
## 5 1955-07-06 57ff021bd3f328f8738bb535c302a31b 1371816917  44.95915  -85.88473
## 6 1991-10-13 798db04aaceb4febd084f1a7c404da93 1371816937  41.74716  -77.58420
##   is_fraud trans_date
## 1        0 2020-06-21
## 2        0 2020-06-21
## 3        0 2020-06-21
## 4        0 2020-06-21
## 5        0 2020-06-21
## 6        0 2020-06-21
# Build df_train1 from df_train by keeping every column except the nine
# listed below, then preview the first six rows of the result.
df_train1 <- df_train[
  ,
  setdiff(
    names(df_train),
    c(
      "X", "cc_num", "merchant", "first", "last", "street", "zip",
      "trans_num", "unix_time"
    )
  )
]
head(x = df_train1, n = 6L)
##   trans_date_trans_time      category    amt gender           city state
## 1   2019-01-01 00:00:18      misc_net   4.97      F Moravian Falls    NC
## 2   2019-01-01 00:00:44   grocery_pos 107.23      F         Orient    WA
## 3   2019-01-01 00:00:51 entertainment 220.11      M     Malad City    ID
## 4   2019-01-01 00:01:16 gas_transport  45.00      M        Boulder    MT
## 5   2019-01-01 00:03:06      misc_pos  41.96      M       Doe Hill    VA
## 6   2019-01-01 00:04:08 gas_transport  94.63      F         Dublin    PA
##       lat      long city_pop                               job        dob
## 1 36.0788  -81.1781     3495         Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105      149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620     4154       Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138     1939                   Patent attorney 1967-01-12
## 5 38.4207  -79.4629       99    Dance movement psychotherapist 1986-03-28
## 6 40.3750  -75.2045     2158                 Transport planner 1961-06-19
##   merch_lat merch_long is_fraud trans_date
## 1  36.01129  -82.04832        0 2019-01-01
## 2  49.15905 -118.18646        0 2019-01-01
## 3  43.15070 -112.15448        0 2019-01-01
## 4  47.03433 -112.56107        0 2019-01-01
## 5  38.67500  -78.63246        0 2019-01-01
## 6  40.65338  -76.15267        0 2019-01-01
# Compact overview of df_train1 after the column removal.
str(object = df_train1)
## 'data.frame':    1296675 obs. of  15 variables:
##  $ trans_date_trans_time: chr  "2019-01-01 00:00:18" "2019-01-01 00:00:44" "2019-01-01 00:00:51" "2019-01-01 00:01:16" ...
##  $ category             : chr  "misc_net" "grocery_pos" "entertainment" "gas_transport" ...
##  $ amt                  : num  4.97 107.23 220.11 45 41.96 ...
##  $ gender               : chr  "F" "F" "M" "M" ...
##  $ city                 : chr  "Moravian Falls" "Orient" "Malad City" "Boulder" ...
##  $ state                : chr  "NC" "WA" "ID" "MT" ...
##  $ lat                  : num  36.1 48.9 42.2 46.2 38.4 ...
##  $ long                 : num  -81.2 -118.2 -112.3 -112.1 -79.5 ...
##  $ city_pop             : int  3495 149 4154 1939 99 2158 2691 6018 1472 151785 ...
##  $ job                  : chr  "Psychologist, counselling" "Special educational needs teacher" "Nature conservation officer" "Patent attorney" ...
##  $ dob                  : POSIXct, format: "1988-03-09" "1978-06-21" ...
##  $ merch_lat            : num  36 49.2 43.2 47 38.7 ...
##  $ merch_long           : num  -82 -118.2 -112.2 -112.6 -78.6 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trans_date           : POSIXct, format: "2019-01-01" "2019-01-01" ...
# Build df_test1 from df_test by keeping every column except the nine
# listed below, then preview the first six rows of the result.
df_test1 <- df_test[
  ,
  setdiff(
    names(df_test),
    c(
      "X", "cc_num", "merchant", "first", "last", "street", "zip",
      "trans_num", "unix_time"
    )
  )
]
head(x = df_test1, n = 6L)
##   trans_date_trans_time       category   amt gender       city state     lat
## 1   2020-06-21 12:14:25  personal_care  2.86      M   Columbia    SC 33.9659
## 2   2020-06-21 12:14:33  personal_care 29.84      F    Altonah    UT 40.3207
## 3   2020-06-21 12:14:53 health_fitness 41.28      F   Bellmore    NY 40.6729
## 4   2020-06-21 12:15:15       misc_pos 60.05      M Titusville    FL 28.5697
## 5   2020-06-21 12:15:17         travel  3.19      M   Falmouth    MI 44.2529
## 6   2020-06-21 12:15:37      kids_pets 19.55      F  Breesport    NY 42.1939
##        long city_pop                    job        dob merch_lat merch_long
## 1  -80.9355   333497    Mechanical engineer 1968-03-19  33.98639  -81.20071
## 2 -110.4360      302 Sales professional, IT 1990-01-17  39.45050 -109.96043
## 3  -73.5365    34496      Librarian, public 1970-10-21  40.49581  -74.19611
## 4  -80.8191    54767           Set designer 1987-07-25  28.81240  -80.88306
## 5  -85.0170     1126     Furniture designer 1955-07-06  44.95915  -85.88473
## 6  -76.7361      520        Psychotherapist 1991-10-13  41.74716  -77.58420
##   is_fraud trans_date
## 1        0 2020-06-21
## 2        0 2020-06-21
## 3        0 2020-06-21
## 4        0 2020-06-21
## 5        0 2020-06-21
## 6        0 2020-06-21
# Compact overview of df_test1 after the column removal.
str(object = df_test1)
## 'data.frame':    555719 obs. of  15 variables:
##  $ trans_date_trans_time: chr  "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
##  $ category             : chr  "personal_care" "personal_care" "health_fitness" "misc_pos" ...
##  $ amt                  : num  2.86 29.84 41.28 60.05 3.19 ...
##  $ gender               : chr  "M" "F" "F" "M" ...
##  $ city                 : chr  "Columbia" "Altonah" "Bellmore" "Titusville" ...
##  $ state                : chr  "SC" "UT" "NY" "FL" ...
##  $ lat                  : num  34 40.3 40.7 28.6 44.3 ...
##  $ long                 : num  -80.9 -110.4 -73.5 -80.8 -85 ...
##  $ city_pop             : int  333497 302 34496 54767 1126 520 1139 343 3688 263 ...
##  $ job                  : chr  "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
##  $ dob                  : POSIXct, format: "1968-03-19" "1990-01-17" ...
##  $ merch_lat            : num  34 39.5 40.5 28.8 45 ...
##  $ merch_long           : num  -81.2 -110 -74.2 -80.9 -85.9 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trans_date           : POSIXct, format: "2020-06-21" "2020-06-21" ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count the rows in each is_fraud class for both data sets.
classes_train <- df_train1 %>% count(is_fraud)
classes_test <- df_test1 %>% count(is_fraud)

# Percentage of `total` rows whose is_fraud value equals `label`, read from a
# count() result. The class is looked up by value rather than by row
# position, so a label that does not occur yields 0 instead of NA or the
# count of a different class.
class_share_pct <- function(classes, label, total) {
  sum(classes$n[classes$is_fraud %in% label]) / total * 100
}

normal_share_train <- class_share_pct(classes_train, 0, nrow(df_train1))
normal_share_train
## [1] 99.42113
fraud_share_train <- class_share_pct(classes_train, 1, nrow(df_train1))
fraud_share_train
## [1] 0.5788652
normal_share_test <- class_share_pct(classes_test, 0, nrow(df_test1))
normal_share_test
## [1] 99.61401
fraud_share_test <- class_share_pct(classes_test, 1, nrow(df_test1))
fraud_share_test
## [1] 0.3859864
#install.packages("ggplot2")
library(ggplot2)
# Bar chart of the two class shares in the training set.
# The plotted values are percentages of all rows (computed as n / nrow * 100),
# so the y-axis is labelled as a share rather than as a transaction count.
Train_data <- cbind(normal_share_train, fraud_share_train)
barplot(Train_data, beside = TRUE, ylab = "Share of transactions (%)",
        main = "Train_data", col = c("darkgreen", "red"))

# Bar chart of the two class shares in the test set.
# The plotted values are percentages of all rows (computed as n / nrow * 100),
# so the y-axis is labelled as a share rather than as a transaction count.
Test_data <- cbind(normal_share_test, fraud_share_test)
barplot(Test_data, beside = TRUE, ylab = "Share of transactions (%)",
        main = "Test_data", col = c("darkgreen", "red"))

# Time elapsed between date of birth and the transaction date, per training row.
# difftime() with explicit units pins the result to days; plain `-` on date-times
# lets R pick the unit from the data, which is not guaranteed to be the same for
# the training and test frames.
df_train1$age_at_trans <- difftime(df_train1$trans_date, df_train1$dob, units = "days")
head(df_train1)
##   trans_date_trans_time      category    amt gender           city state
## 1   2019-01-01 00:00:18      misc_net   4.97      F Moravian Falls    NC
## 2   2019-01-01 00:00:44   grocery_pos 107.23      F         Orient    WA
## 3   2019-01-01 00:00:51 entertainment 220.11      M     Malad City    ID
## 4   2019-01-01 00:01:16 gas_transport  45.00      M        Boulder    MT
## 5   2019-01-01 00:03:06      misc_pos  41.96      M       Doe Hill    VA
## 6   2019-01-01 00:04:08 gas_transport  94.63      F         Dublin    PA
##       lat      long city_pop                               job        dob
## 1 36.0788  -81.1781     3495         Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105      149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620     4154       Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138     1939                   Patent attorney 1967-01-12
## 5 38.4207  -79.4629       99    Dance movement psychotherapist 1986-03-28
## 6 40.3750  -75.2045     2158                 Transport planner 1961-06-19
##   merch_lat merch_long is_fraud trans_date age_at_trans
## 1  36.01129  -82.04832        0 2019-01-01   11255 days
## 2  49.15905 -118.18646        0 2019-01-01   14804 days
## 3  43.15070 -112.15448        0 2019-01-01   20801 days
## 4  47.03433 -112.56107        0 2019-01-01   18982 days
## 5  38.67500  -78.63246        0 2019-01-01   11967 days
## 6  40.65338  -76.15267        0 2019-01-01   21015 days
# Time elapsed between date of birth and the transaction date, per test row.
# difftime() with explicit units pins the result to days; plain `-` on date-times
# lets R pick the unit from the data, which is not guaranteed to be the same for
# the training and test frames.
df_test1$age_at_trans <- difftime(df_test1$trans_date, df_test1$dob, units = "days")
head(df_test1)
##   trans_date_trans_time       category   amt gender       city state     lat
## 1   2020-06-21 12:14:25  personal_care  2.86      M   Columbia    SC 33.9659
## 2   2020-06-21 12:14:33  personal_care 29.84      F    Altonah    UT 40.3207
## 3   2020-06-21 12:14:53 health_fitness 41.28      F   Bellmore    NY 40.6729
## 4   2020-06-21 12:15:15       misc_pos 60.05      M Titusville    FL 28.5697
## 5   2020-06-21 12:15:17         travel  3.19      M   Falmouth    MI 44.2529
## 6   2020-06-21 12:15:37      kids_pets 19.55      F  Breesport    NY 42.1939
##        long city_pop                    job        dob merch_lat merch_long
## 1  -80.9355   333497    Mechanical engineer 1968-03-19  33.98639  -81.20071
## 2 -110.4360      302 Sales professional, IT 1990-01-17  39.45050 -109.96043
## 3  -73.5365    34496      Librarian, public 1970-10-21  40.49581  -74.19611
## 4  -80.8191    54767           Set designer 1987-07-25  28.81240  -80.88306
## 5  -85.0170     1126     Furniture designer 1955-07-06  44.95915  -85.88473
## 6  -76.7361      520        Psychotherapist 1991-10-13  41.74716  -77.58420
##   is_fraud trans_date age_at_trans
## 1        0 2020-06-21   19087 days
## 2        0 2020-06-21   11113 days
## 3        0 2020-06-21   18141 days
## 4        0 2020-06-21   12020 days
## 5        0 2020-06-21   23727 days
## 6        0 2020-06-21   10479 days
# Re-inspect the test frame's structure now that age_at_trans has been added.
str(df_test1)
## 'data.frame':    555719 obs. of  16 variables:
##  $ trans_date_trans_time: chr  "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
##  $ category             : chr  "personal_care" "personal_care" "health_fitness" "misc_pos" ...
##  $ amt                  : num  2.86 29.84 41.28 60.05 3.19 ...
##  $ gender               : chr  "M" "F" "F" "M" ...
##  $ city                 : chr  "Columbia" "Altonah" "Bellmore" "Titusville" ...
##  $ state                : chr  "SC" "UT" "NY" "FL" ...
##  $ lat                  : num  34 40.3 40.7 28.6 44.3 ...
##  $ long                 : num  -80.9 -110.4 -73.5 -80.8 -85 ...
##  $ city_pop             : int  333497 302 34496 54767 1126 520 1139 343 3688 263 ...
##  $ job                  : chr  "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
##  $ dob                  : POSIXct, format: "1968-03-19" "1990-01-17" ...
##  $ merch_lat            : num  34 39.5 40.5 28.8 45 ...
##  $ merch_long           : num  -81.2 -110 -74.2 -80.9 -85.9 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trans_date           : POSIXct, format: "2020-06-21" "2020-06-21" ...
##  $ age_at_trans         : 'difftime' num  19087 11113 18141 12020 ...
##   ..- attr(*, "units")= chr "days"
# Two-digit month of the transaction date, stored as text (e.g. "01").
df_train1[["trans_month"]] <- format(df_train1[["trans_date"]], format = "%m")
head(df_train1)
##   trans_date_trans_time      category    amt gender           city state
## 1   2019-01-01 00:00:18      misc_net   4.97      F Moravian Falls    NC
## 2   2019-01-01 00:00:44   grocery_pos 107.23      F         Orient    WA
## 3   2019-01-01 00:00:51 entertainment 220.11      M     Malad City    ID
## 4   2019-01-01 00:01:16 gas_transport  45.00      M        Boulder    MT
## 5   2019-01-01 00:03:06      misc_pos  41.96      M       Doe Hill    VA
## 6   2019-01-01 00:04:08 gas_transport  94.63      F         Dublin    PA
##       lat      long city_pop                               job        dob
## 1 36.0788  -81.1781     3495         Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105      149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620     4154       Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138     1939                   Patent attorney 1967-01-12
## 5 38.4207  -79.4629       99    Dance movement psychotherapist 1986-03-28
## 6 40.3750  -75.2045     2158                 Transport planner 1961-06-19
##   merch_lat merch_long is_fraud trans_date age_at_trans trans_month
## 1  36.01129  -82.04832        0 2019-01-01   11255 days          01
## 2  49.15905 -118.18646        0 2019-01-01   14804 days          01
## 3  43.15070 -112.15448        0 2019-01-01   20801 days          01
## 4  47.03433 -112.56107        0 2019-01-01   18982 days          01
## 5  38.67500  -78.63246        0 2019-01-01   11967 days          01
## 6  40.65338  -76.15267        0 2019-01-01   21015 days          01
# Two-digit month of the transaction date, stored as text (e.g. "06").
df_test1[["trans_month"]] <- format(df_test1[["trans_date"]], format = "%m")
head(df_test1)
##   trans_date_trans_time       category   amt gender       city state     lat
## 1   2020-06-21 12:14:25  personal_care  2.86      M   Columbia    SC 33.9659
## 2   2020-06-21 12:14:33  personal_care 29.84      F    Altonah    UT 40.3207
## 3   2020-06-21 12:14:53 health_fitness 41.28      F   Bellmore    NY 40.6729
## 4   2020-06-21 12:15:15       misc_pos 60.05      M Titusville    FL 28.5697
## 5   2020-06-21 12:15:17         travel  3.19      M   Falmouth    MI 44.2529
## 6   2020-06-21 12:15:37      kids_pets 19.55      F  Breesport    NY 42.1939
##        long city_pop                    job        dob merch_lat merch_long
## 1  -80.9355   333497    Mechanical engineer 1968-03-19  33.98639  -81.20071
## 2 -110.4360      302 Sales professional, IT 1990-01-17  39.45050 -109.96043
## 3  -73.5365    34496      Librarian, public 1970-10-21  40.49581  -74.19611
## 4  -80.8191    54767           Set designer 1987-07-25  28.81240  -80.88306
## 5  -85.0170     1126     Furniture designer 1955-07-06  44.95915  -85.88473
## 6  -76.7361      520        Psychotherapist 1991-10-13  41.74716  -77.58420
##   is_fraud trans_date age_at_trans trans_month
## 1        0 2020-06-21   19087 days          06
## 2        0 2020-06-21   11113 days          06
## 3        0 2020-06-21   18141 days          06
## 4        0 2020-06-21   12020 days          06
## 5        0 2020-06-21   23727 days          06
## 6        0 2020-06-21   10479 days          06
# Re-inspect the test frame's structure now that trans_month has been added.
str(df_test1)
## 'data.frame':    555719 obs. of  17 variables:
##  $ trans_date_trans_time: chr  "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
##  $ category             : chr  "personal_care" "personal_care" "health_fitness" "misc_pos" ...
##  $ amt                  : num  2.86 29.84 41.28 60.05 3.19 ...
##  $ gender               : chr  "M" "F" "F" "M" ...
##  $ city                 : chr  "Columbia" "Altonah" "Bellmore" "Titusville" ...
##  $ state                : chr  "SC" "UT" "NY" "FL" ...
##  $ lat                  : num  34 40.3 40.7 28.6 44.3 ...
##  $ long                 : num  -80.9 -110.4 -73.5 -80.8 -85 ...
##  $ city_pop             : int  333497 302 34496 54767 1126 520 1139 343 3688 263 ...
##  $ job                  : chr  "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
##  $ dob                  : POSIXct, format: "1968-03-19" "1990-01-17" ...
##  $ merch_lat            : num  34 39.5 40.5 28.8 45 ...
##  $ merch_long           : num  -81.2 -110 -74.2 -80.9 -85.9 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trans_date           : POSIXct, format: "2020-06-21" "2020-06-21" ...
##  $ age_at_trans         : 'difftime' num  19087 11113 18141 12020 ...
##   ..- attr(*, "units")= chr "days"
##  $ trans_month          : chr  "06" "06" "06" "06" ...
# Number of distinct values in each column of the training frame.
# vapply() visits the data frame one column at a time and keeps each column's own
# type. apply() would first coerce the whole frame to a character matrix, turning
# numeric and date-time columns into formatted text, so values that differ only
# beyond the printed precision collapse into one string and are under-counted.
vapply(df_train1, function(x) length(unique(x)), integer(1))
## trans_date_trans_time              category                   amt 
##               1274791                    14                 52928 
##                gender                  city                 state 
##                     2                   894                    51 
##                   lat                  long              city_pop 
##                   968                   969                   879 
##                   job                   dob             merch_lat 
##                   494                   968                917042 
##            merch_long              is_fraud            trans_date 
##               1106872                     2                   537 
##          age_at_trans           trans_month 
##                 30424                    12
# Number of distinct values in each column of the test frame.
# vapply() visits the data frame one column at a time and keeps each column's own
# type. apply() would first coerce the whole frame to a character matrix, turning
# numeric and date-time columns into formatted text, so values that differ only
# beyond the printed precision collapse into one string and are under-counted.
vapply(df_test1, function(x) length(unique(x)), integer(1))
## trans_date_trans_time              category                   amt 
##                544760                    14                 37256 
##                gender                  city                 state 
##                     2                   849                    50 
##                   lat                  long              city_pop 
##                   910                   910                   835 
##                   job                   dob             merch_lat 
##                   478                   910                474151 
##            merch_long              is_fraud            trans_date 
##                518539                     2                   194 
##          age_at_trans           trans_month 
##                 27866                     7
# Drop the raw timestamp, location, job and date columns by name; every other
# column of the training frame is kept in its original order.
df_train2 <- df_train1[, setdiff(
  names(df_train1),
  c("trans_date_trans_time", "city", "lat", "long", "job", "dob",
    "merch_lat", "merch_long", "trans_date")
)]
head(df_train2)
##        category    amt gender state city_pop is_fraud age_at_trans trans_month
## 1      misc_net   4.97      F    NC     3495        0   11255 days          01
## 2   grocery_pos 107.23      F    WA      149        0   14804 days          01
## 3 entertainment 220.11      M    ID     4154        0   20801 days          01
## 4 gas_transport  45.00      M    MT     1939        0   18982 days          01
## 5      misc_pos  41.96      M    VA       99        0   11967 days          01
## 6 gas_transport  94.63      F    PA     2158        0   21015 days          01
# Drop the raw timestamp, location, job and date columns by name; every other
# column of the test frame is kept in its original order.
df_test2 <- df_test1[, setdiff(
  names(df_test1),
  c("trans_date_trans_time", "city", "lat", "long", "job", "dob",
    "merch_lat", "merch_long", "trans_date")
)]
head(df_test2)
##         category   amt gender state city_pop is_fraud age_at_trans trans_month
## 1  personal_care  2.86      M    SC   333497        0   19087 days          06
## 2  personal_care 29.84      F    UT      302        0   11113 days          06
## 3 health_fitness 41.28      F    NY    34496        0   18141 days          06
## 4       misc_pos 60.05      M    FL    54767        0   12020 days          06
## 5         travel  3.19      M    MI     1126        0   23727 days          06
## 6      kids_pets 19.55      F    NY      520        0   10479 days          06
# Encode gender as a numeric flag: 1 where the value is "M", 0 otherwise.
df_train2[["gender"]] <- as.numeric(df_train2[["gender"]] == "M")
head(df_train2)
##        category    amt gender state city_pop is_fraud age_at_trans trans_month
## 1      misc_net   4.97      0    NC     3495        0   11255 days          01
## 2   grocery_pos 107.23      0    WA      149        0   14804 days          01
## 3 entertainment 220.11      1    ID     4154        0   20801 days          01
## 4 gas_transport  45.00      1    MT     1939        0   18982 days          01
## 5      misc_pos  41.96      1    VA       99        0   11967 days          01
## 6 gas_transport  94.63      0    PA     2158        0   21015 days          01
# Encode gender as a numeric flag: 1 where the value is "M", 0 otherwise.
df_test2[["gender"]] <- as.numeric(df_test2[["gender"]] == "M")
head(df_test2)
##         category   amt gender state city_pop is_fraud age_at_trans trans_month
## 1  personal_care  2.86      1    SC   333497        0   19087 days          06
## 2  personal_care 29.84      0    UT      302        0   11113 days          06
## 3 health_fitness 41.28      0    NY    34496        0   18141 days          06
## 4       misc_pos 60.05      1    FL    54767        0   12020 days          06
## 5         travel  3.19      1    MI     1126        0   23727 days          06
## 6      kids_pets 19.55      0    NY      520        0   10479 days          06
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Per-column summary of the reduced test frame (counts, missing values, distinct
# values and quantiles); describe() is presumably the Hmisc version loaded just above.
describe(df_test2)
## df_test2 
## 
##  8  Variables      555719  Observations
## --------------------------------------------------------------------------------
## category 
##        n  missing distinct 
##   555719        0       14 
## 
## lowest : entertainment food_dining   gas_transport grocery_net   grocery_pos  
## highest: misc_pos      personal_care shopping_net  shopping_pos  travel       
## 
## entertainment (40104, 0.072), food_dining (39268, 0.071), gas_transport (56370,
## 0.101), grocery_net (19426, 0.035), grocery_pos (52553, 0.095), health_fitness
## (36674, 0.066), home (52345, 0.094), kids_pets (48692, 0.088), misc_net (27367,
## 0.049), misc_pos (34574, 0.062), personal_care (39327, 0.071), shopping_net
## (41779, 0.075), shopping_pos (49791, 0.090), travel (17449, 0.031)
## --------------------------------------------------------------------------------
## amt 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   555719        0    37256        1    69.39    80.26     2.43     4.08 
##      .25      .50      .75      .90      .95 
##     9.63    47.29    83.01   135.55   193.05 
## 
## lowest :     1.00     1.01     1.02     1.03     1.04
## highest: 16339.26 16837.08 19364.91 21437.71 22768.11
## --------------------------------------------------------------------------------
## gender 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##   555719        0        2    0.743   250833   0.4514   0.4953 
## 
## --------------------------------------------------------------------------------
## state 
##        n  missing distinct 
##   555719        0       50 
## 
## lowest : AK AL AR AZ CA, highest: VT WA WI WV WY
## --------------------------------------------------------------------------------
## city_pop 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   555719        0      835        1    88222   157874      139      260 
##      .25      .50      .75      .90      .95 
##      741     2408    19685   186140   525713 
## 
## lowest :      23      37      43      46      47
## highest: 1577385 1595797 2383912 2504700 2906700
## --------------------------------------------------------------------------------
## is_fraud 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##   555719        0        2    0.012     2145  0.00386  0.00769 
## 
## --------------------------------------------------------------------------------
## age_at_trans [days] 
##        n  missing distinct 
##   555719        0    27866 
## 
## lowest :  5622 days  5623 days  5624 days  5625 days  5626 days
## highest: 35121 days 35122 days 35123 days 35124 days 35126 days
## --------------------------------------------------------------------------------
## trans_month 
##        n  missing distinct 
##   555719        0        7 
## 
## lowest : 06 07 08 09 10, highest: 08 09 10 11 12
##                                                            
## Value           6      7      8      9     10     11     12
## Frequency   30058  85848  88759  69533  69348  72635 139538
## Proportion  0.054  0.154  0.160  0.125  0.125  0.131  0.251
## --------------------------------------------------------------------------------
#install.packages("fastDummies")
library(fastDummies)
# Append one 0/1 indicator column per distinct value of `category`
# (the original `category` column is left in place).
df_train3 <- df_train2 %>%
  dummy_cols(select_columns = "category")
head(df_train3)
##        category    amt gender state city_pop is_fraud age_at_trans trans_month
## 1      misc_net   4.97      0    NC     3495        0   11255 days          01
## 2   grocery_pos 107.23      0    WA      149        0   14804 days          01
## 3 entertainment 220.11      1    ID     4154        0   20801 days          01
## 4 gas_transport  45.00      1    MT     1939        0   18982 days          01
## 5      misc_pos  41.96      1    VA       99        0   11967 days          01
## 6 gas_transport  94.63      0    PA     2158        0   21015 days          01
##   category_entertainment category_food_dining category_gas_transport
## 1                      0                    0                      0
## 2                      0                    0                      0
## 3                      1                    0                      0
## 4                      0                    0                      1
## 5                      0                    0                      0
## 6                      0                    0                      1
##   category_grocery_net category_grocery_pos category_health_fitness
## 1                    0                    0                       0
## 2                    0                    1                       0
## 3                    0                    0                       0
## 4                    0                    0                       0
## 5                    0                    0                       0
## 6                    0                    0                       0
##   category_home category_kids_pets category_misc_net category_misc_pos
## 1             0                  0                 1                 0
## 2             0                  0                 0                 0
## 3             0                  0                 0                 0
## 4             0                  0                 0                 0
## 5             0                  0                 0                 1
## 6             0                  0                 0                 0
##   category_personal_care category_shopping_net category_shopping_pos
## 1                      0                     0                     0
## 2                      0                     0                     0
## 3                      0                     0                     0
## 4                      0                     0                     0
## 5                      0                     0                     0
## 6                      0                     0                     0
##   category_travel
## 1               0
## 2               0
## 3               0
## 4               0
## 5               0
## 6               0
# Append one 0/1 indicator column per distinct value of `category`
# (the original `category` column is left in place).
df_test3 <- df_test2 %>%
  dummy_cols(select_columns = "category")
head(df_test3)
##         category   amt gender state city_pop is_fraud age_at_trans trans_month
## 1  personal_care  2.86      1    SC   333497        0   19087 days          06
## 2  personal_care 29.84      0    UT      302        0   11113 days          06
## 3 health_fitness 41.28      0    NY    34496        0   18141 days          06
## 4       misc_pos 60.05      1    FL    54767        0   12020 days          06
## 5         travel  3.19      1    MI     1126        0   23727 days          06
## 6      kids_pets 19.55      0    NY      520        0   10479 days          06
##   category_entertainment category_food_dining category_gas_transport
## 1                      0                    0                      0
## 2                      0                    0                      0
## 3                      0                    0                      0
## 4                      0                    0                      0
## 5                      0                    0                      0
## 6                      0                    0                      0
##   category_grocery_net category_grocery_pos category_health_fitness
## 1                    0                    0                       0
## 2                    0                    0                       0
## 3                    0                    0                       1
## 4                    0                    0                       0
## 5                    0                    0                       0
## 6                    0                    0                       0
##   category_home category_kids_pets category_misc_net category_misc_pos
## 1             0                  0                 0                 0
## 2             0                  0                 0                 0
## 3             0                  0                 0                 0
## 4             0                  0                 0                 1
## 5             0                  0                 0                 0
## 6             0                  1                 0                 0
##   category_personal_care category_shopping_net category_shopping_pos
## 1                      1                     0                     0
## 2                      1                     0                     0
## 3                      0                     0                     0
## 4                      0                     0                     0
## 5                      0                     0                     0
## 6                      0                     0                     0
##   category_travel
## 1               0
## 2               0
## 3               0
## 4               0
## 5               1
## 6               0
# Remove the `state` column from the training frame; all other columns are kept.
df_train4 <- df_train3[, names(df_train3) != "state"]
head(df_train4)
##        category    amt gender city_pop is_fraud age_at_trans trans_month
## 1      misc_net   4.97      0     3495        0   11255 days          01
## 2   grocery_pos 107.23      0      149        0   14804 days          01
## 3 entertainment 220.11      1     4154        0   20801 days          01
## 4 gas_transport  45.00      1     1939        0   18982 days          01
## 5      misc_pos  41.96      1       99        0   11967 days          01
## 6 gas_transport  94.63      0     2158        0   21015 days          01
##   category_entertainment category_food_dining category_gas_transport
## 1                      0                    0                      0
## 2                      0                    0                      0
## 3                      1                    0                      0
## 4                      0                    0                      1
## 5                      0                    0                      0
## 6                      0                    0                      1
##   category_grocery_net category_grocery_pos category_health_fitness
## 1                    0                    0                       0
## 2                    0                    1                       0
## 3                    0                    0                       0
## 4                    0                    0                       0
## 5                    0                    0                       0
## 6                    0                    0                       0
##   category_home category_kids_pets category_misc_net category_misc_pos
## 1             0                  0                 1                 0
## 2             0                  0                 0                 0
## 3             0                  0                 0                 0
## 4             0                  0                 0                 0
## 5             0                  0                 0                 1
## 6             0                  0                 0                 0
##   category_personal_care category_shopping_net category_shopping_pos
## 1                      0                     0                     0
## 2                      0                     0                     0
## 3                      0                     0                     0
## 4                      0                     0                     0
## 5                      0                     0                     0
## 6                      0                     0                     0
##   category_travel
## 1               0
## 2               0
## 3               0
## 4               0
## 5               0
## 6               0
# Remove the `state` column from the test frame; all other columns are kept.
df_test4 <- df_test3[, names(df_test3) != "state"]
head(df_test4)
##         category   amt gender city_pop is_fraud age_at_trans trans_month
## 1  personal_care  2.86      1   333497        0   19087 days          06
## 2  personal_care 29.84      0      302        0   11113 days          06
## 3 health_fitness 41.28      0    34496        0   18141 days          06
## 4       misc_pos 60.05      1    54767        0   12020 days          06
## 5         travel  3.19      1     1126        0   23727 days          06
## 6      kids_pets 19.55      0      520        0   10479 days          06
##   category_entertainment category_food_dining category_gas_transport
## 1                      0                    0                      0
## 2                      0                    0                      0
## 3                      0                    0                      0
## 4                      0                    0                      0
## 5                      0                    0                      0
## 6                      0                    0                      0
##   category_grocery_net category_grocery_pos category_health_fitness
## 1                    0                    0                       0
## 2                    0                    0                       0
## 3                    0                    0                       1
## 4                    0                    0                       0
## 5                    0                    0                       0
## 6                    0                    0                       0
##   category_home category_kids_pets category_misc_net category_misc_pos
## 1             0                  0                 0                 0
## 2             0                  0                 0                 0
## 3             0                  0                 0                 0
## 4             0                  0                 0                 1
## 5             0                  0                 0                 0
## 6             0                  1                 0                 0
##   category_personal_care category_shopping_net category_shopping_pos
## 1                      1                     0                     0
## 2                      1                     0                     0
## 3                      0                     0                     0
## 4                      0                     0                     0
## 5                      0                     0                     0
## 6                      0                     0                     0
##   category_travel
## 1               0
## 2               0
## 3               0
## 4               0
## 5               1
## 6               0
# Split each prepared frame into its predictor columns (everything except
# is_fraud) and the is_fraud label vector.
X_train <- df_train4[, names(df_train4) != "is_fraud"]
y_train <- df_train4[["is_fraud"]]
X_test <- df_test4[, names(df_test4) != "is_fraud"]
y_test <- df_test4[["is_fraud"]]
#library(randomForest)
#rf = randomForest(x = X_train,
#                  y = y_train,n_estimators=10, max_depth=5, max_features=12, min_samples_leaf=100, random_state=100, oob_score=True)
#head(rf)
#install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Single-feature ROC on the training data: how well the third feature column
# alone separates fraud from non-fraud.
# pROC::roc() expects (response, predictor); the arguments are named here so
# the fraud label is the response and the feature is the predictor.
roc1 <- roc(response = y_train, predictor = X_train[, 3])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve held in roc1.
plot(roc1)

# Show the area under that curve.
roc1[["auc"]]
## Area under the curve: 0.5006
library(rpart)
# Baseline decision tree on the unbalanced training data, scored on the
# held-out test set.
# NOTE(review): is_fraud appears to be a numeric 0/1 column (see the summary
# statistics further down); in that case rpart grows a regression tree and
# predict() returns a fraud score per row rather than a class label.
orig_fit <- rpart(is_fraud ~ ., data = df_train4)
# predict.rpart has no `method` argument (it was silently swallowed by `...`),
# so it is dropped; the default prediction type is used, as before.
pred_orig <- predict(orig_fit, newdata = df_test4)
# pROC's switch for drawing the curve is `plot`, not `plotit`.
roc(df_test4$is_fraud, pred_orig, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = df_test4$is_fraud, predictor = pred_orig,     plotit = TRUE)
## 
## Data: pred_orig in 553574 controls (df_test4$is_fraud 0) < 2145 cases (df_test4$is_fraud 1).
## Area under the curve: 0.9281
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Descriptive statistics for each column of the training frame.
pastecs::stat.desc(df_train4)
##          category          amt       gender     city_pop     is_fraud
## nbr.val        NA 1.296675e+06 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null       NA 0.000000e+00 7.098630e+05 0.000000e+00 1.289169e+06
## nbr.na         NA 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min            NA 1.000000e+00 0.000000e+00 2.300000e+01 0.000000e+00
## max            NA 2.894890e+04 1.000000e+00 2.906700e+06 1.000000e+00
## range          NA 2.894790e+04 1.000000e+00 2.906677e+06 1.000000e+00
## sum            NA 9.122243e+07 5.868120e+05 1.151764e+11 7.506000e+03
## median         NA 4.752000e+01 0.000000e+00 2.456000e+03 0.000000e+00
## mean           NA 7.035104e+01 4.525513e-01 8.882444e+04 5.788652e-03
## SE.mean        NA 1.407866e-01 4.371095e-04 2.651726e+02 6.662123e-05
## CI.mean        NA 2.759370e-01 8.567196e-04 5.197292e+02 1.305753e-04
## var            NA 2.570123e+04 2.477488e-01 9.117764e+10 5.755148e-03
## std.dev        NA 1.603160e+02 4.977437e-01 3.019564e+05 7.586269e-02
## coef.var       NA 2.278801e+00 1.099861e+00 3.399474e+00 1.310542e+01
##          age_at_trans trans_month category_entertainment category_food_dining
## nbr.val  1.296675e+06          NA           1.296675e+06         1.296675e+06
## nbr.null 0.000000e+00          NA           1.202661e+06         1.205214e+06
## nbr.na   0.000000e+00          NA           0.000000e+00         0.000000e+00
## min      5.085000e+03          NA           0.000000e+00         0.000000e+00
## max      3.493200e+04          NA           1.000000e+00         1.000000e+00
## range    2.984700e+04          NA           1.000000e+00         1.000000e+00
## sum      2.178438e+10          NA           9.401400e+04         9.146100e+04
## median   1.606000e+04          NA           0.000000e+00         0.000000e+00
## mean     1.680019e+04          NA           7.250390e-02         7.053502e-02
## SE.mean  5.579236e+00          NA           2.277306e-04         2.248555e-04
## CI.mean  1.093511e+01          NA           4.463441e-04         4.407091e-04
## var      4.036274e+07          NA           6.724714e-02         6.555988e-02
## std.dev  6.353168e+03          NA           2.593205e-01         2.560466e-01
## coef.var 3.781606e-01          NA           3.576642e+00         3.630064e+00
##          category_gas_transport category_grocery_net category_grocery_pos
## nbr.val            1.296675e+06         1.296675e+06         1.296675e+06
## nbr.null           1.165016e+06         1.251223e+06         1.173037e+06
## nbr.na             0.000000e+00         0.000000e+00         0.000000e+00
## min                0.000000e+00         0.000000e+00         0.000000e+00
## max                1.000000e+00         1.000000e+00         1.000000e+00
## range              1.000000e+00         1.000000e+00         1.000000e+00
## sum                1.316590e+05         4.545200e+04         1.236380e+05
## median             0.000000e+00         0.000000e+00         0.000000e+00
## mean               1.015359e-01         3.505273e-02         9.535003e-02
## SE.mean            2.652435e-04         1.615092e-04         2.579202e-04
## CI.mean            5.198681e-04         3.165525e-04         5.055147e-04
## var                9.122639e-02         3.382406e-02         8.625847e-02
## std.dev            3.020371e-01         1.839132e-01         2.936979e-01
## coef.var           2.974684e+00         5.246758e+00         3.080208e+00
##          category_health_fitness category_home category_kids_pets
## nbr.val             1.296675e+06  1.296675e+06       1.296675e+06
## nbr.null            1.210796e+06  1.173560e+06       1.183640e+06
## nbr.na              0.000000e+00  0.000000e+00       0.000000e+00
## min                 0.000000e+00  0.000000e+00       0.000000e+00
## max                 1.000000e+00  1.000000e+00       1.000000e+00
## range               1.000000e+00  1.000000e+00       1.000000e+00
## sum                 8.587900e+04  1.231150e+05       1.130350e+05
## median              0.000000e+00  0.000000e+00       0.000000e+00
## mean                6.623017e-02  9.494669e-02       8.717296e-02
## SE.mean             2.183898e-04  2.574314e-04       2.477249e-04
## CI.mean             4.280366e-04  5.045568e-04       4.855324e-04
## var                 6.184378e-02  8.593188e-02       7.957390e-02
## std.dev             2.486841e-01  2.931414e-01       2.820885e-01
## coef.var            3.754846e+00  3.087432e+00       3.235963e+00
##          category_misc_net category_misc_pos category_personal_care
## nbr.val       1.296675e+06      1.296675e+06           1.296675e+06
## nbr.null      1.233388e+06      1.217020e+06           1.205917e+06
## nbr.na        0.000000e+00      0.000000e+00           0.000000e+00
## min           0.000000e+00      0.000000e+00           0.000000e+00
## max           1.000000e+00      1.000000e+00           1.000000e+00
## range         1.000000e+00      1.000000e+00           1.000000e+00
## sum           6.328700e+04      7.965500e+04           9.075800e+04
## median        0.000000e+00      0.000000e+00           0.000000e+00
## mean          4.880714e-02      6.143020e-02           6.999287e-02
## SE.mean       1.892172e-04      2.108671e-04           2.240550e-04
## CI.mean       3.708592e-04      4.132923e-04           4.391401e-04
## var           4.642504e-02      5.765657e-02           6.509392e-02
## std.dev       2.154647e-01      2.401178e-01           2.551351e-01
## coef.var      4.414614e+00      3.908791e+00           3.645159e+00
##          category_shopping_net category_shopping_pos category_travel
## nbr.val           1.296675e+06          1.296675e+06    1.296675e+06
## nbr.null          1.199132e+06          1.180003e+06    1.256168e+06
## nbr.na            0.000000e+00          0.000000e+00    0.000000e+00
## min               0.000000e+00          0.000000e+00    0.000000e+00
## max               1.000000e+00          1.000000e+00    1.000000e+00
## range             1.000000e+00          1.000000e+00    1.000000e+00
## sum               9.754300e+04          1.166720e+05    4.050700e+04
## median            0.000000e+00          0.000000e+00    0.000000e+00
## mean              7.522548e-02          8.997783e-02    3.123913e-02
## SE.mean           2.316248e-04          2.512918e-04    1.527715e-04
## CI.mean           4.539766e-04          4.925234e-04    2.994269e-04
## var               6.956666e-02          8.188188e-02    3.026327e-02
## std.dev           2.637549e-01          2.861501e-01    1.739634e-01
## coef.var          3.506191e+00          3.180229e+00    5.568766e+00
# Descriptive statistics for each column of the test frame.
pastecs::stat.desc(df_test4)
##          category          amt       gender     city_pop     is_fraud
## nbr.val        NA 5.557190e+05 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null       NA 0.000000e+00 3.048860e+05 0.000000e+00 5.535740e+05
## nbr.na         NA 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min            NA 1.000000e+00 0.000000e+00 2.300000e+01 0.000000e+00
## max            NA 2.276811e+04 1.000000e+00 2.906700e+06 1.000000e+00
## range          NA 2.276711e+04 1.000000e+00 2.906677e+06 1.000000e+00
## sum            NA 3.856290e+07 2.508330e+05 4.902658e+10 2.145000e+03
## median         NA 4.729000e+01 0.000000e+00 2.408000e+03 0.000000e+00
## mean           NA 6.939281e+01 4.513666e-01 8.822189e+04 3.859864e-03
## SE.mean        NA 2.102658e-01 6.675420e-04 4.029574e+02 8.318002e-05
## CI.mean        NA 4.121143e-01 1.308361e-03 7.897837e+02 1.630302e-04
## var            NA 2.456929e+04 2.476352e-01 9.023469e+10 3.844973e-03
## std.dev        NA 1.567459e+02 4.976296e-01 3.003909e+05 6.200784e-02
## coef.var       NA 2.258821e+00 1.102495e+00 3.404947e+00 1.606477e+01
##          age_at_trans trans_month category_entertainment category_food_dining
## nbr.val  5.557190e+05          NA           5.557190e+05         5.557190e+05
## nbr.null 0.000000e+00          NA           5.156150e+05         5.164510e+05
## nbr.na   0.000000e+00          NA           0.000000e+00         0.000000e+00
## min      5.622000e+03          NA           0.000000e+00         0.000000e+00
## max      3.512600e+04          NA           1.000000e+00         1.000000e+00
## range    2.950400e+04          NA           1.000000e+00         1.000000e+00
## sum      9.517154e+09          NA           4.010400e+04         3.926800e+04
## median   1.640300e+04          NA           0.000000e+00         0.000000e+00
## mean     1.712584e+04          NA           7.216597e-02         7.066161e-02
## SE.mean  8.540633e+00          NA           3.471156e-04         3.437569e-04
## CI.mean  1.673937e+01          NA           6.803356e-04         6.737527e-04
## var      4.053549e+07          NA           6.695816e-02         6.566867e-02
## std.dev  6.366749e+03          NA           2.587628e-01         2.562590e-01
## coef.var 3.717628e-01          NA           3.585662e+00         3.626566e+00
##          category_gas_transport category_grocery_net category_grocery_pos
## nbr.val            5.557190e+05         5.557190e+05         5.557190e+05
## nbr.null           4.993490e+05         5.362930e+05         5.031660e+05
## nbr.na             0.000000e+00         0.000000e+00         0.000000e+00
## min                0.000000e+00         0.000000e+00         0.000000e+00
## max                1.000000e+00         1.000000e+00         1.000000e+00
## range              1.000000e+00         1.000000e+00         1.000000e+00
## sum                5.637000e+04         1.942600e+04         5.255300e+04
## median             0.000000e+00         0.000000e+00         0.000000e+00
## mean               1.014362e-01         3.495652e-02         9.456758e-02
## SE.mean            4.049894e-04         2.463827e-04         3.925292e-04
## CI.mean            7.937663e-04         4.829023e-04         7.693447e-04
## var                9.114703e-02         3.373462e-02         8.562471e-02
## std.dev            3.019057e-01         1.836699e-01         2.926170e-01
## coef.var           2.976312e+00         5.254238e+00         3.094263e+00
##          category_health_fitness category_home category_kids_pets
## nbr.val             5.557190e+05  5.557190e+05       5.557190e+05
## nbr.null            5.190450e+05  5.033740e+05       5.070270e+05
## nbr.na              0.000000e+00  0.000000e+00       0.000000e+00
## min                 0.000000e+00  0.000000e+00       0.000000e+00
## max                 1.000000e+00  1.000000e+00       1.000000e+00
## range               1.000000e+00  1.000000e+00       1.000000e+00
## sum                 3.667400e+04  5.234500e+04       4.869200e+04
## median              0.000000e+00  0.000000e+00       0.000000e+00
## mean                6.599378e-02  9.419329e-02       8.761982e-02
## SE.mean             3.330421e-04  3.918326e-04       3.792817e-04
## CI.mean             6.527520e-04  7.679794e-04       7.433801e-04
## var                 6.163872e-02  8.532107e-02       7.994273e-02
## std.dev             2.482715e-01  2.920977e-01       2.827415e-01
## coef.var            3.762043e+00  3.101046e+00       3.226912e+00
##          category_misc_net category_misc_pos category_personal_care
## nbr.val       5.557190e+05      5.557190e+05           5.557190e+05
## nbr.null      5.283520e+05      5.211450e+05           5.163920e+05
## nbr.na        0.000000e+00      0.000000e+00           0.000000e+00
## min           0.000000e+00      0.000000e+00           0.000000e+00
## max           1.000000e+00      1.000000e+00           1.000000e+00
## range         1.000000e+00      1.000000e+00           1.000000e+00
## sum           2.736700e+04      3.457400e+04           3.932700e+04
## median        0.000000e+00      0.000000e+00           0.000000e+00
## mean          4.924611e-02      6.221490e-02           7.076778e-02
## SE.mean       2.902637e-04      3.240199e-04           3.439954e-04
## CI.mean       5.689077e-04      6.350686e-04           6.742201e-04
## var           4.682102e-02      5.834431e-02           6.575982e-02
## std.dev       2.163816e-01      2.415457e-01           2.564368e-01
## coef.var      4.393883e+00      3.882441e+00           3.623637e+00
##          category_shopping_net category_shopping_pos category_travel
## nbr.val           5.557190e+05          5.557190e+05    5.557190e+05
## nbr.null          5.139400e+05          5.059280e+05    5.382700e+05
## nbr.na            0.000000e+00          0.000000e+00    0.000000e+00
## min               0.000000e+00          0.000000e+00    0.000000e+00
## max               1.000000e+00          1.000000e+00    1.000000e+00
## range             1.000000e+00          1.000000e+00    1.000000e+00
## sum               4.177900e+04          4.979100e+04    1.744900e+04
## median            0.000000e+00          0.000000e+00    0.000000e+00
## mean              7.518008e-02          8.959744e-02    3.139896e-02
## SE.mean           3.537144e-04          3.831222e-04    2.339391e-04
## CI.mean           6.932690e-04          7.509073e-04    4.585133e-04
## var               6.952816e-02          8.156989e-02    3.041312e-02
## std.dev           2.636819e-01          2.856044e-01    1.743936e-01
## coef.var          3.507338e+00          3.187640e+00    5.554119e+00
# Keep only the identifying columns of each training transaction (timestamp,
# amount, transaction id) together with its true fraud label. The column
# names are spelled out so they match the "df_train.<column>" names that
# data.frame() derives from unnamed `df_train$<column>` arguments.
df_train_merge <- data.frame(
  df_train.trans_date_trans_time = df_train$trans_date_trans_time,
  df_train.amt = df_train$amt,
  df_train.trans_num = df_train$trans_num,
  df_train.is_fraud = df_train$is_fraud
)
head(df_train_merge, n = 6L)
##   df_train.trans_date_trans_time df_train.amt               df_train.trans_num
## 1            2019-01-01 00:00:18         4.97 0b242abb623afc578575680df30655b9
## 2            2019-01-01 00:00:44       107.23 1f76529f8574734946361c461b024d99
## 3            2019-01-01 00:00:51       220.11 a1a22d70485983eac12b5b88dad1cf95
## 4            2019-01-01 00:01:16        45.00 6b849c168bdad6f867558c3793159a81
## 5            2019-01-01 00:03:06        41.96 a41d7549acf90789359a9aa5346dcb46
## 6            2019-01-01 00:04:08        94.63 189a841a0a8ba03058526bcfe566aab5
##   df_train.is_fraud
## 1                 0
## 2                 0
## 3                 0
## 4                 0
## 5                 0
## 6                 0
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
# Flag a training transaction as predicted fraud when the fitted tree's score
# exceeds the 0.16 cut-off. The cut-off has to be applied to model output:
# thresholding the 0/1 labels in y_train merely reproduces the labels.
# unname() drops the row-name labels predict() attaches, leaving a plain
# vector.
# NOTE(review): assumes df_train4 holds the same rows, in the same order, as
# df_train so these flags line up with df_train_merge — confirm.
df_train_pred <- ifelse(
  unname(predict(orig_fit, newdata = df_train4)) > 0.16,
  1,
  0
)
head(df_train_pred)
## [1] 0 0 0 0 0 0
# Append the predicted-fraud flag as an extra column next to the transaction
# details and the true label.
df_train_final <- cbind(df_train_merge, df_train_pred = df_train_pred)
head(df_train_final, n = 6L)
##   df_train.trans_date_trans_time df_train.amt               df_train.trans_num
## 1            2019-01-01 00:00:18         4.97 0b242abb623afc578575680df30655b9
## 2            2019-01-01 00:00:44       107.23 1f76529f8574734946361c461b024d99
## 3            2019-01-01 00:00:51       220.11 a1a22d70485983eac12b5b88dad1cf95
## 4            2019-01-01 00:01:16        45.00 6b849c168bdad6f867558c3793159a81
## 5            2019-01-01 00:03:06        41.96 a41d7549acf90789359a9aa5346dcb46
## 6            2019-01-01 00:04:08        94.63 189a841a0a8ba03058526bcfe566aab5
##   df_train.is_fraud df_train_pred
## 1                 0             0
## 2                 0             0
## 3                 0             0
## 4                 0             0
## 5                 0             0
## 6                 0             0
# Give the prediction column a descriptive name.
colnames(df_train_final)[which(colnames(df_train_final) == "df_train_pred")] <- "is_fraud_pred"
head(df_train_final, n = 6L)
##   df_train.trans_date_trans_time df_train.amt               df_train.trans_num
## 1            2019-01-01 00:00:18         4.97 0b242abb623afc578575680df30655b9
## 2            2019-01-01 00:00:44       107.23 1f76529f8574734946361c461b024d99
## 3            2019-01-01 00:00:51       220.11 a1a22d70485983eac12b5b88dad1cf95
## 4            2019-01-01 00:01:16        45.00 6b849c168bdad6f867558c3793159a81
## 5            2019-01-01 00:03:06        41.96 a41d7549acf90789359a9aa5346dcb46
## 6            2019-01-01 00:04:08        94.63 189a841a0a8ba03058526bcfe566aab5
##   df_train.is_fraud is_fraud_pred
## 1                 0             0
## 2                 0             0
## 3                 0             0
## 4                 0             0
## 5                 0             0
## 6                 0             0
library(dplyr)
# Tally the training transactions by their true fraud label.
count(df_train_final, df_train.is_fraud)
##   df_train.is_fraud       n
## 1                 0 1289169
## 2                 1    7506
# Tally the training transactions by their predicted fraud flag.
count(df_train_final, is_fraud_pred)
##   is_fraud_pred       n
## 1             0 1289169
## 2             1    7506
# Stack the training and test transactions into a single frame.
fraud <- rbind(df_train, df_test)
head(fraud, n = 6L)
##   X trans_date_trans_time       cc_num                           merchant
## 1 0   2019-01-01 00:00:18 2.703186e+15         fraud_Rippin, Kub and Mann
## 2 1   2019-01-01 00:00:44 6.304233e+11    fraud_Heller, Gutmann and Zieme
## 3 2   2019-01-01 00:00:51 3.885949e+13               fraud_Lind-Buckridge
## 4 3   2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4   2019-01-01 00:03:06 3.755342e+14                fraud_Keeling-Crist
## 6 5   2019-01-01 00:04:08 4.767265e+15   fraud_Stroman, Hudson and Erdman
##        category    amt     first    last gender                       street
## 1      misc_net   4.97  Jennifer   Banks      F               561 Perry Cove
## 2   grocery_pos 107.23 Stephanie    Gill      F 43039 Riley Greens Suite 393
## 3 entertainment 220.11    Edward Sanchez      M     594 White Dale Suite 530
## 4 gas_transport  45.00    Jeremy   White      M  9443 Cynthia Court Apt. 038
## 5      misc_pos  41.96     Tyler  Garcia      M             408 Bradley Rest
## 6 gas_transport  94.63  Jennifer  Conner      F            4655 David Island
##             city state   zip     lat      long city_pop
## 1 Moravian Falls    NC 28654 36.0788  -81.1781     3495
## 2         Orient    WA 99160 48.8878 -118.2105      149
## 3     Malad City    ID 83252 42.1808 -112.2620     4154
## 4        Boulder    MT 59632 46.2306 -112.1138     1939
## 5       Doe Hill    VA 24433 38.4207  -79.4629       99
## 6         Dublin    PA 18917 40.3750  -75.2045     2158
##                                 job        dob                        trans_num
## 1         Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3       Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4                   Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5    Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6                 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
##    unix_time merch_lat merch_long is_fraud trans_date
## 1 1325376018  36.01129  -82.04832        0 2019-01-01
## 2 1325376044  49.15905 -118.18646        0 2019-01-01
## 3 1325376051  43.15070 -112.15448        0 2019-01-01
## 4 1325376076  47.03433 -112.56107        0 2019-01-01
## 5 1325376186  38.67500  -78.63246        0 2019-01-01
## 6 1325376248  40.65338  -76.15267        0 2019-01-01
# Mean number of transactions per month: total row count spread over 24.
# NOTE(review): 24 is hard-coded; presumably the combined data covers
# 24 calendar months — confirm.
avg_transactions_pm <- NROW(fraud) / 24
print(avg_transactions_pm)
## [1] 77183.08

The average number of transactions per month is 77,183.

# Mean number of fraudulent transactions per month.
# Counting the TRUE values of the comparison avoids copying the whole filtered
# data frame just to take its row count, compares is_fraud against a number
# rather than the string '1', and (via na.rm) keeps rows with a missing label
# from being counted as fraud.
# NOTE(review): 24 is hard-coded; presumably the combined data covers
# 24 calendar months — confirm.
avg_fraudtrans_pm1 <- sum(fraud$is_fraud == 1, na.rm = TRUE) / 24
print(avg_fraudtrans_pm1)
## [1] 402.125

The average number of fraudulent transactions per month is 402.

# Mean amount of a fraudulent transaction.
# mean() over the fraud rows replaces the hand-rolled sum / row-count, whose
# numerator and denominator used two differently written filters
# (`== 1` and `== '1'`). which() drops rows with a missing label.
avg_fraud_amt <- mean(fraud$amt[which(fraud$is_fraud == 1)])
print(avg_fraud_amt)
## [1] 530.6614

The average amount per fraudulent transaction is 530.66.

# Monthly fraud cost without a model: the average number of fraudulent
# transactions per month times the average amount of one.
cost_before_model <- avg_fraudtrans_pm1 * avg_fraud_amt
print(cost_before_model)
## [1] 213392.2

The cost incurred per month, based on the first point described above: 213392.2