# Load the training split of the transactions data and preview its first rows.
# NOTE(review): cc_num is parsed as a double; the recorded summary shows values
# up to ~5e18, which is beyond 2^53, so long card numbers are not stored
# exactly. Consider reading it as character if it is ever used as a key.
df_train <- read.csv(
  file = "/Users/suyogkan/Desktop/Cap/Credit-Card-Fraud-Detection-Capstone-Project/fraudTrain.csv"
)
utils::head(df_train, n = 6L)
## X trans_date_trans_time cc_num merchant
## 1 0 2019-01-01 00:00:18 2.703186e+15 fraud_Rippin, Kub and Mann
## 2 1 2019-01-01 00:00:44 6.304233e+11 fraud_Heller, Gutmann and Zieme
## 3 2 2019-01-01 00:00:51 3.885949e+13 fraud_Lind-Buckridge
## 4 3 2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4 2019-01-01 00:03:06 3.755342e+14 fraud_Keeling-Crist
## 6 5 2019-01-01 00:04:08 4.767265e+15 fraud_Stroman, Hudson and Erdman
## category amt first last gender street
## 1 misc_net 4.97 Jennifer Banks F 561 Perry Cove
## 2 grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393
## 3 entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530
## 4 gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038
## 5 misc_pos 41.96 Tyler Garcia M 408 Bradley Rest
## 6 gas_transport 94.63 Jennifer Conner F 4655 David Island
## city state zip lat long city_pop
## 1 Moravian Falls NC 28654 36.0788 -81.1781 3495
## 2 Orient WA 99160 48.8878 -118.2105 149
## 3 Malad City ID 83252 42.1808 -112.2620 4154
## 4 Boulder MT 59632 46.2306 -112.1138 1939
## 5 Doe Hill VA 24433 38.4207 -79.4629 99
## 6 Dublin PA 18917 40.3750 -75.2045 2158
## job dob trans_num
## 1 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
## unix_time merch_lat merch_long is_fraud
## 1 1325376018 36.01129 -82.04832 0
## 2 1325376044 49.15905 -118.18646 0
## 3 1325376051 43.15070 -112.15448 0
## 4 1325376076 47.03433 -112.56107 0
## 5 1325376186 38.67500 -78.63246 0
## 6 1325376248 40.65338 -76.15267 0
# Show the column types and a few leading values of the raw training data.
utils::str(df_train)
## 'data.frame': 1296675 obs. of 23 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ trans_date_trans_time: chr "2019-01-01 00:00:18" "2019-01-01 00:00:44" "2019-01-01 00:00:51" "2019-01-01 00:01:16" ...
## $ cc_num : num 2.70e+15 6.30e+11 3.89e+13 3.53e+15 3.76e+14 ...
## $ merchant : chr "fraud_Rippin, Kub and Mann" "fraud_Heller, Gutmann and Zieme" "fraud_Lind-Buckridge" "fraud_Kutch, Hermiston and Farrell" ...
## $ category : chr "misc_net" "grocery_pos" "entertainment" "gas_transport" ...
## $ amt : num 4.97 107.23 220.11 45 41.96 ...
## $ first : chr "Jennifer" "Stephanie" "Edward" "Jeremy" ...
## $ last : chr "Banks" "Gill" "Sanchez" "White" ...
## $ gender : chr "F" "F" "M" "M" ...
## $ street : chr "561 Perry Cove" "43039 Riley Greens Suite 393" "594 White Dale Suite 530" "9443 Cynthia Court Apt. 038" ...
## $ city : chr "Moravian Falls" "Orient" "Malad City" "Boulder" ...
## $ state : chr "NC" "WA" "ID" "MT" ...
## $ zip : int 28654 99160 83252 59632 24433 18917 67851 22824 15665 37040 ...
## $ lat : num 36.1 48.9 42.2 46.2 38.4 ...
## $ long : num -81.2 -118.2 -112.3 -112.1 -79.5 ...
## $ city_pop : int 3495 149 4154 1939 99 2158 2691 6018 1472 151785 ...
## $ job : chr "Psychologist, counselling" "Special educational needs teacher" "Nature conservation officer" "Patent attorney" ...
## $ dob : chr "1988-03-09" "1978-06-21" "1962-01-19" "1967-01-12" ...
## $ trans_num : chr "0b242abb623afc578575680df30655b9" "1f76529f8574734946361c461b024d99" "a1a22d70485983eac12b5b88dad1cf95" "6b849c168bdad6f867558c3793159a81" ...
## $ unix_time : int 1325376018 1325376044 1325376051 1325376076 1325376186 1325376248 1325376282 1325376308 1325376318 1325376361 ...
## $ merch_lat : num 36 49.2 43.2 47 38.7 ...
## $ merch_long : num -82 -118.2 -112.2 -112.6 -78.6 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary statistics for the raw training data.
summary(object = df_train)
## X trans_date_trans_time cc_num merchant
## Min. : 0 Length:1296675 Min. :6.042e+10 Length:1296675
## 1st Qu.: 324168 Class :character 1st Qu.:1.800e+14 Class :character
## Median : 648337 Mode :character Median :3.521e+15 Mode :character
## Mean : 648337 Mean :4.172e+17
## 3rd Qu.: 972506 3rd Qu.:4.642e+15
## Max. :1296674 Max. :4.992e+18
## category amt first last
## Length:1296675 Min. : 1.00 Length:1296675 Length:1296675
## Class :character 1st Qu.: 9.65 Class :character Class :character
## Mode :character Median : 47.52 Mode :character Mode :character
## Mean : 70.35
## 3rd Qu.: 83.14
## Max. :28948.90
## gender street city state
## Length:1296675 Length:1296675 Length:1296675 Length:1296675
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## zip lat long city_pop
## Min. : 1257 Min. :20.03 Min. :-165.67 Min. : 23
## 1st Qu.:26237 1st Qu.:34.62 1st Qu.: -96.80 1st Qu.: 743
## Median :48174 Median :39.35 Median : -87.48 Median : 2456
## Mean :48801 Mean :38.54 Mean : -90.23 Mean : 88824
## 3rd Qu.:72042 3rd Qu.:41.94 3rd Qu.: -80.16 3rd Qu.: 20328
## Max. :99783 Max. :66.69 Max. : -67.95 Max. :2906700
## job dob trans_num unix_time
## Length:1296675 Length:1296675 Length:1296675 Min. :1.325e+09
## Class :character Class :character Class :character 1st Qu.:1.339e+09
## Mode :character Mode :character Mode :character Median :1.349e+09
## Mean :1.349e+09
## 3rd Qu.:1.359e+09
## Max. :1.372e+09
## merch_lat merch_long is_fraud
## Min. :19.03 Min. :-166.67 Min. :0.000000
## 1st Qu.:34.73 1st Qu.: -96.90 1st Qu.:0.000000
## Median :39.37 Median : -87.44 Median :0.000000
## Mean :38.54 Mean : -90.23 Mean :0.005789
## 3rd Qu.:41.96 3rd Qu.: -80.24 3rd Qu.:0.000000
## Max. :67.51 Max. : -66.95 Max. :1.000000
# Load the test split of the transactions data and preview its first rows.
# NOTE(review): cc_num is parsed as a double; the recorded summary shows values
# up to ~5e18, which is beyond 2^53, so long card numbers are not stored
# exactly. Consider reading it as character if it is ever used as a key.
df_test <- read.csv(
  file = "/Users/suyogkan/Desktop/Cap/Credit-Card-Fraud-Detection-Capstone-Project/fraudTest.csv"
)
utils::head(df_test, n = 6L)
## X trans_date_trans_time cc_num merchant
## 1 0 2020-06-21 12:14:25 2.291164e+15 fraud_Kirlin and Sons
## 2 1 2020-06-21 12:14:33 3.573030e+15 fraud_Sporer-Keebler
## 3 2 2020-06-21 12:14:53 3.598215e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3 2020-06-21 12:15:15 3.591920e+15 fraud_Haley Group
## 5 4 2020-06-21 12:15:17 3.526826e+15 fraud_Johnston-Casper
## 6 5 2020-06-21 12:15:37 3.040768e+13 fraud_Daugherty LLC
## category amt first last gender street
## 1 personal_care 2.86 Jeff Elliott M 351 Darlene Green
## 2 personal_care 29.84 Joanne Williams F 3638 Marsh Union
## 3 health_fitness 41.28 Ashley Lopez F 9333 Valentine Point
## 4 misc_pos 60.05 Brian Williams M 32941 Krystal Mill Apt. 552
## 5 travel 3.19 Nathan Massey M 5783 Evan Roads Apt. 465
## 6 kids_pets 19.55 Danielle Evans F 76752 David Lodge Apt. 064
## city state zip lat long city_pop job
## 1 Columbia SC 29209 33.9659 -80.9355 333497 Mechanical engineer
## 2 Altonah UT 84002 40.3207 -110.4360 302 Sales professional, IT
## 3 Bellmore NY 11710 40.6729 -73.5365 34496 Librarian, public
## 4 Titusville FL 32780 28.5697 -80.8191 54767 Set designer
## 5 Falmouth MI 49632 44.2529 -85.0170 1126 Furniture designer
## 6 Breesport NY 14816 42.1939 -76.7361 520 Psychotherapist
## dob trans_num unix_time merch_lat merch_long
## 1 1968-03-19 2da90c7d74bd46a0caf3777415b3ebd3 1371816865 33.98639 -81.20071
## 2 1990-01-17 324cc204407e99f51b0d6ca0055005e7 1371816873 39.45050 -109.96043
## 3 1970-10-21 c81755dbbbea9d5c77f094348a7579be 1371816893 40.49581 -74.19611
## 4 1987-07-25 2159175b9efe66dc301f149d3d5abf8c 1371816915 28.81240 -80.88306
## 5 1955-07-06 57ff021bd3f328f8738bb535c302a31b 1371816917 44.95915 -85.88473
## 6 1991-10-13 798db04aaceb4febd084f1a7c404da93 1371816937 41.74716 -77.58420
## is_fraud
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Show the column types and a few leading values of the raw test data.
utils::str(df_test)
## 'data.frame': 555719 obs. of 23 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ trans_date_trans_time: chr "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
## $ cc_num : num 2.29e+15 3.57e+15 3.60e+15 3.59e+15 3.53e+15 ...
## $ merchant : chr "fraud_Kirlin and Sons" "fraud_Sporer-Keebler" "fraud_Swaniawski, Nitzsche and Welch" "fraud_Haley Group" ...
## $ category : chr "personal_care" "personal_care" "health_fitness" "misc_pos" ...
## $ amt : num 2.86 29.84 41.28 60.05 3.19 ...
## $ first : chr "Jeff" "Joanne" "Ashley" "Brian" ...
## $ last : chr "Elliott" "Williams" "Lopez" "Williams" ...
## $ gender : chr "M" "F" "F" "M" ...
## $ street : chr "351 Darlene Green" "3638 Marsh Union" "9333 Valentine Point" "32941 Krystal Mill Apt. 552" ...
## $ city : chr "Columbia" "Altonah" "Bellmore" "Titusville" ...
## $ state : chr "SC" "UT" "NY" "FL" ...
## $ zip : int 29209 84002 11710 32780 49632 14816 95528 57374 16858 76678 ...
## $ lat : num 34 40.3 40.7 28.6 44.3 ...
## $ long : num -80.9 -110.4 -73.5 -80.8 -85 ...
## $ city_pop : int 333497 302 34496 54767 1126 520 1139 343 3688 263 ...
## $ job : chr "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
## $ dob : chr "1968-03-19" "1990-01-17" "1970-10-21" "1987-07-25" ...
## $ trans_num : chr "2da90c7d74bd46a0caf3777415b3ebd3" "324cc204407e99f51b0d6ca0055005e7" "c81755dbbbea9d5c77f094348a7579be" "2159175b9efe66dc301f149d3d5abf8c" ...
## $ unix_time : int 1371816865 1371816873 1371816893 1371816915 1371816917 1371816937 1371816944 1371816950 1371816970 1371816971 ...
## $ merch_lat : num 34 39.5 40.5 28.8 45 ...
## $ merch_long : num -81.2 -110 -74.2 -80.9 -85.9 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary statistics for the raw test data.
summary(object = df_test)
## X trans_date_trans_time cc_num merchant
## Min. : 0 Length:555719 Min. :6.042e+10 Length:555719
## 1st Qu.:138930 Class :character 1st Qu.:1.800e+14 Class :character
## Median :277859 Mode :character Median :3.521e+15 Mode :character
## Mean :277859 Mean :4.178e+17
## 3rd Qu.:416788 3rd Qu.:4.635e+15
## Max. :555718 Max. :4.992e+18
## category amt first last
## Length:555719 Min. : 1.00 Length:555719 Length:555719
## Class :character 1st Qu.: 9.63 Class :character Class :character
## Mode :character Median : 47.29 Mode :character Mode :character
## Mean : 69.39
## 3rd Qu.: 83.01
## Max. :22768.11
## gender street city state
## Length:555719 Length:555719 Length:555719 Length:555719
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## zip lat long city_pop
## Min. : 1257 Min. :20.03 Min. :-165.67 Min. : 23
## 1st Qu.:26292 1st Qu.:34.67 1st Qu.: -96.80 1st Qu.: 741
## Median :48174 Median :39.37 Median : -87.48 Median : 2408
## Mean :48843 Mean :38.54 Mean : -90.23 Mean : 88222
## 3rd Qu.:72011 3rd Qu.:41.89 3rd Qu.: -80.18 3rd Qu.: 19685
## Max. :99921 Max. :65.69 Max. : -67.95 Max. :2906700
## job dob trans_num unix_time
## Length:555719 Length:555719 Length:555719 Min. :1.372e+09
## Class :character Class :character Class :character 1st Qu.:1.376e+09
## Mode :character Mode :character Mode :character Median :1.381e+09
## Mean :1.381e+09
## 3rd Qu.:1.386e+09
## Max. :1.389e+09
## merch_lat merch_long is_fraud
## Min. :19.03 Min. :-166.67 Min. :0.00000
## 1st Qu.:34.76 1st Qu.: -96.91 1st Qu.:0.00000
## Median :39.38 Median : -87.45 Median :0.00000
## Mean :38.54 Mean : -90.23 Mean :0.00386
## 3rd Qu.:41.95 3rd Qu.: -80.26 3rd Qu.:0.00000
## Max. :66.68 Max. : -66.95 Max. :1.00000
# Add a date-only transaction timestamp (trans_date) and parse dob as a
# date-time. The format stops at the day, so the time-of-day part of
# trans_date_trans_time is ignored and every value lands on midnight.
#
# Parsing is pinned to UTC: without an explicit tz, as.POSIXct() uses the
# machine's local time zone, where midnights on either side of a daylight
# saving change are not a whole number of days apart (and a local midnight
# skipped by a DST change parses to NA). UTC keeps later date differences
# in whole days on any machine.
df_train$trans_date <- as.POSIXct(
  df_train$trans_date_trans_time,
  format = "%Y-%m-%d",
  tz = "UTC"
)
df_train$dob <- as.POSIXct(df_train$dob, format = "%Y-%m-%d", tz = "UTC")
head(df_train)
## X trans_date_trans_time cc_num merchant
## 1 0 2019-01-01 00:00:18 2.703186e+15 fraud_Rippin, Kub and Mann
## 2 1 2019-01-01 00:00:44 6.304233e+11 fraud_Heller, Gutmann and Zieme
## 3 2 2019-01-01 00:00:51 3.885949e+13 fraud_Lind-Buckridge
## 4 3 2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4 2019-01-01 00:03:06 3.755342e+14 fraud_Keeling-Crist
## 6 5 2019-01-01 00:04:08 4.767265e+15 fraud_Stroman, Hudson and Erdman
## category amt first last gender street
## 1 misc_net 4.97 Jennifer Banks F 561 Perry Cove
## 2 grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393
## 3 entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530
## 4 gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038
## 5 misc_pos 41.96 Tyler Garcia M 408 Bradley Rest
## 6 gas_transport 94.63 Jennifer Conner F 4655 David Island
## city state zip lat long city_pop
## 1 Moravian Falls NC 28654 36.0788 -81.1781 3495
## 2 Orient WA 99160 48.8878 -118.2105 149
## 3 Malad City ID 83252 42.1808 -112.2620 4154
## 4 Boulder MT 59632 46.2306 -112.1138 1939
## 5 Doe Hill VA 24433 38.4207 -79.4629 99
## 6 Dublin PA 18917 40.3750 -75.2045 2158
## job dob trans_num
## 1 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
## unix_time merch_lat merch_long is_fraud trans_date
## 1 1325376018 36.01129 -82.04832 0 2019-01-01
## 2 1325376044 49.15905 -118.18646 0 2019-01-01
## 3 1325376051 43.15070 -112.15448 0 2019-01-01
## 4 1325376076 47.03433 -112.56107 0 2019-01-01
## 5 1325376186 38.67500 -78.63246 0 2019-01-01
## 6 1325376248 40.65338 -76.15267 0 2019-01-01
# Add a date-only transaction timestamp (trans_date) and parse dob as a
# date-time. The format stops at the day, so the time-of-day part of
# trans_date_trans_time is ignored and every value lands on midnight.
#
# Parsing is pinned to UTC: without an explicit tz, as.POSIXct() uses the
# machine's local time zone, where midnights on either side of a daylight
# saving change are not a whole number of days apart (and a local midnight
# skipped by a DST change parses to NA). UTC keeps later date differences
# in whole days on any machine.
df_test$trans_date <- as.POSIXct(
  df_test$trans_date_trans_time,
  format = "%Y-%m-%d",
  tz = "UTC"
)
df_test$dob <- as.POSIXct(df_test$dob, format = "%Y-%m-%d", tz = "UTC")
head(df_test)
## X trans_date_trans_time cc_num merchant
## 1 0 2020-06-21 12:14:25 2.291164e+15 fraud_Kirlin and Sons
## 2 1 2020-06-21 12:14:33 3.573030e+15 fraud_Sporer-Keebler
## 3 2 2020-06-21 12:14:53 3.598215e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3 2020-06-21 12:15:15 3.591920e+15 fraud_Haley Group
## 5 4 2020-06-21 12:15:17 3.526826e+15 fraud_Johnston-Casper
## 6 5 2020-06-21 12:15:37 3.040768e+13 fraud_Daugherty LLC
## category amt first last gender street
## 1 personal_care 2.86 Jeff Elliott M 351 Darlene Green
## 2 personal_care 29.84 Joanne Williams F 3638 Marsh Union
## 3 health_fitness 41.28 Ashley Lopez F 9333 Valentine Point
## 4 misc_pos 60.05 Brian Williams M 32941 Krystal Mill Apt. 552
## 5 travel 3.19 Nathan Massey M 5783 Evan Roads Apt. 465
## 6 kids_pets 19.55 Danielle Evans F 76752 David Lodge Apt. 064
## city state zip lat long city_pop job
## 1 Columbia SC 29209 33.9659 -80.9355 333497 Mechanical engineer
## 2 Altonah UT 84002 40.3207 -110.4360 302 Sales professional, IT
## 3 Bellmore NY 11710 40.6729 -73.5365 34496 Librarian, public
## 4 Titusville FL 32780 28.5697 -80.8191 54767 Set designer
## 5 Falmouth MI 49632 44.2529 -85.0170 1126 Furniture designer
## 6 Breesport NY 14816 42.1939 -76.7361 520 Psychotherapist
## dob trans_num unix_time merch_lat merch_long
## 1 1968-03-19 2da90c7d74bd46a0caf3777415b3ebd3 1371816865 33.98639 -81.20071
## 2 1990-01-17 324cc204407e99f51b0d6ca0055005e7 1371816873 39.45050 -109.96043
## 3 1970-10-21 c81755dbbbea9d5c77f094348a7579be 1371816893 40.49581 -74.19611
## 4 1987-07-25 2159175b9efe66dc301f149d3d5abf8c 1371816915 28.81240 -80.88306
## 5 1955-07-06 57ff021bd3f328f8738bb535c302a31b 1371816917 44.95915 -85.88473
## 6 1991-10-13 798db04aaceb4febd084f1a7c404da93 1371816937 41.74716 -77.58420
## is_fraud trans_date
## 1 0 2020-06-21
## 2 0 2020-06-21
## 3 0 2020-06-21
## 4 0 2020-06-21
## 5 0 2020-06-21
## 6 0 2020-06-21
# Build df_train1 by discarding the row index, card/transaction identifiers,
# personal name and address fields, zip and unix_time; all other columns are
# kept in their original order.
df_train1 <- df_train[
  ,
  !is.element(
    names(df_train),
    c(
      "X", "cc_num", "merchant", "first", "last",
      "street", "zip", "trans_num", "unix_time"
    )
  )
]
utils::head(df_train1, n = 6L)
## trans_date_trans_time category amt gender city state
## 1 2019-01-01 00:00:18 misc_net 4.97 F Moravian Falls NC
## 2 2019-01-01 00:00:44 grocery_pos 107.23 F Orient WA
## 3 2019-01-01 00:00:51 entertainment 220.11 M Malad City ID
## 4 2019-01-01 00:01:16 gas_transport 45.00 M Boulder MT
## 5 2019-01-01 00:03:06 misc_pos 41.96 M Doe Hill VA
## 6 2019-01-01 00:04:08 gas_transport 94.63 F Dublin PA
## lat long city_pop job dob
## 1 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138 1939 Patent attorney 1967-01-12
## 5 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28
## 6 40.3750 -75.2045 2158 Transport planner 1961-06-19
## merch_lat merch_long is_fraud trans_date
## 1 36.01129 -82.04832 0 2019-01-01
## 2 49.15905 -118.18646 0 2019-01-01
## 3 43.15070 -112.15448 0 2019-01-01
## 4 47.03433 -112.56107 0 2019-01-01
## 5 38.67500 -78.63246 0 2019-01-01
## 6 40.65338 -76.15267 0 2019-01-01
# Confirm which columns remain in the reduced training frame and their types.
utils::str(df_train1)
## 'data.frame': 1296675 obs. of 15 variables:
## $ trans_date_trans_time: chr "2019-01-01 00:00:18" "2019-01-01 00:00:44" "2019-01-01 00:00:51" "2019-01-01 00:01:16" ...
## $ category : chr "misc_net" "grocery_pos" "entertainment" "gas_transport" ...
## $ amt : num 4.97 107.23 220.11 45 41.96 ...
## $ gender : chr "F" "F" "M" "M" ...
## $ city : chr "Moravian Falls" "Orient" "Malad City" "Boulder" ...
## $ state : chr "NC" "WA" "ID" "MT" ...
## $ lat : num 36.1 48.9 42.2 46.2 38.4 ...
## $ long : num -81.2 -118.2 -112.3 -112.1 -79.5 ...
## $ city_pop : int 3495 149 4154 1939 99 2158 2691 6018 1472 151785 ...
## $ job : chr "Psychologist, counselling" "Special educational needs teacher" "Nature conservation officer" "Patent attorney" ...
## $ dob : POSIXct, format: "1988-03-09" "1978-06-21" ...
## $ merch_lat : num 36 49.2 43.2 47 38.7 ...
## $ merch_long : num -82 -118.2 -112.2 -112.6 -78.6 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trans_date : POSIXct, format: "2019-01-01" "2019-01-01" ...
# Build df_test1 by discarding the row index, card/transaction identifiers,
# personal name and address fields, zip and unix_time; all other columns are
# kept in their original order.
df_test1 <- df_test[
  ,
  !is.element(
    names(df_test),
    c(
      "X", "cc_num", "merchant", "first", "last",
      "street", "zip", "trans_num", "unix_time"
    )
  )
]
utils::head(df_test1, n = 6L)
## trans_date_trans_time category amt gender city state lat
## 1 2020-06-21 12:14:25 personal_care 2.86 M Columbia SC 33.9659
## 2 2020-06-21 12:14:33 personal_care 29.84 F Altonah UT 40.3207
## 3 2020-06-21 12:14:53 health_fitness 41.28 F Bellmore NY 40.6729
## 4 2020-06-21 12:15:15 misc_pos 60.05 M Titusville FL 28.5697
## 5 2020-06-21 12:15:17 travel 3.19 M Falmouth MI 44.2529
## 6 2020-06-21 12:15:37 kids_pets 19.55 F Breesport NY 42.1939
## long city_pop job dob merch_lat merch_long
## 1 -80.9355 333497 Mechanical engineer 1968-03-19 33.98639 -81.20071
## 2 -110.4360 302 Sales professional, IT 1990-01-17 39.45050 -109.96043
## 3 -73.5365 34496 Librarian, public 1970-10-21 40.49581 -74.19611
## 4 -80.8191 54767 Set designer 1987-07-25 28.81240 -80.88306
## 5 -85.0170 1126 Furniture designer 1955-07-06 44.95915 -85.88473
## 6 -76.7361 520 Psychotherapist 1991-10-13 41.74716 -77.58420
## is_fraud trans_date
## 1 0 2020-06-21
## 2 0 2020-06-21
## 3 0 2020-06-21
## 4 0 2020-06-21
## 5 0 2020-06-21
## 6 0 2020-06-21
# Confirm which columns remain in the reduced test frame and their types.
utils::str(df_test1)
## 'data.frame': 555719 obs. of 15 variables:
## $ trans_date_trans_time: chr "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
## $ category : chr "personal_care" "personal_care" "health_fitness" "misc_pos" ...
## $ amt : num 2.86 29.84 41.28 60.05 3.19 ...
## $ gender : chr "M" "F" "F" "M" ...
## $ city : chr "Columbia" "Altonah" "Bellmore" "Titusville" ...
## $ state : chr "SC" "UT" "NY" "FL" ...
## $ lat : num 34 40.3 40.7 28.6 44.3 ...
## $ long : num -80.9 -110.4 -73.5 -80.8 -85 ...
## $ city_pop : int 333497 302 34496 54767 1126 520 1139 343 3688 263 ...
## $ job : chr "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
## $ dob : POSIXct, format: "1968-03-19" "1990-01-17" ...
## $ merch_lat : num 34 39.5 40.5 28.8 45 ...
## $ merch_long : num -81.2 -110 -74.2 -80.9 -85.9 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trans_date : POSIXct, format: "2020-06-21" "2020-06-21" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Tally the number of rows per is_fraud value in each split.
classes_train <- df_train1 %>% count(is_fraud)
classes_test <- df_test1 %>% count(is_fraud)
# Percentage of training rows with is_fraud == 0. The count is looked up by
# class value rather than by row position, so the figure stays correct (and
# becomes 0 instead of a mislabelled or NA value) if a class is absent.
normal_share_train <- sum(classes_train$n[classes_train$is_fraud %in% 0]) /
  nrow(df_train1) * 100
head(normal_share_train)
## [1] 99.42113
# Percentage of training rows with is_fraud == 1. The count is looked up by
# class value rather than by row position, so a split with no fraud rows
# yields 0 instead of NA.
fraud_share_train <- sum(classes_train$n[classes_train$is_fraud %in% 1]) /
  nrow(df_train1) * 100
head(fraud_share_train)
## [1] 0.5788652
# Percentage of test rows with is_fraud == 0. The count is looked up by class
# value rather than by row position, so the figure stays correct (and becomes
# 0 instead of a mislabelled or NA value) if a class is absent.
normal_share_test <- sum(classes_test$n[classes_test$is_fraud %in% 0]) /
  nrow(df_test1) * 100
head(normal_share_test)
## [1] 99.61401
# Percentage of test rows with is_fraud == 1. The count is looked up by class
# value rather than by row position, so a split with no fraud rows yields 0
# instead of NA.
fraud_share_test <- sum(classes_test$n[classes_test$is_fraud %in% 1]) /
  nrow(df_test1) * 100
head(fraud_share_test)
## [1] 0.3859864
#install.packages("ggplot2")
library(ggplot2)
# Bar charts of the class balance in each split. The plotted values are the
# percentage shares computed above (not raw counts), so the y axis is
# labelled as a percentage.
Train_data <- cbind(normal_share_train, fraud_share_train)
barplot(
  Train_data,
  beside = TRUE,
  ylab = "Share of transactions (%)",
  main = "Train_data",
  col = c("darkgreen", "red")
)
Test_data <- cbind(normal_share_test, fraud_share_test)
barplot(
  Test_data,
  beside = TRUE,
  ylab = "Share of transactions (%)",
  main = "Test_data",
  col = c("darkgreen", "red")
)
# Cardholder age at the time of the transaction, as a difftime in days.
# The unit is requested explicitly: plain subtraction lets difftime pick the
# unit from the data ("auto"), so the meaning of the number could change
# with the smallest difference present.
df_train1$age_at_trans <- difftime(
  df_train1$trans_date,
  df_train1$dob,
  units = "days"
)
head(df_train1)
## trans_date_trans_time category amt gender city state
## 1 2019-01-01 00:00:18 misc_net 4.97 F Moravian Falls NC
## 2 2019-01-01 00:00:44 grocery_pos 107.23 F Orient WA
## 3 2019-01-01 00:00:51 entertainment 220.11 M Malad City ID
## 4 2019-01-01 00:01:16 gas_transport 45.00 M Boulder MT
## 5 2019-01-01 00:03:06 misc_pos 41.96 M Doe Hill VA
## 6 2019-01-01 00:04:08 gas_transport 94.63 F Dublin PA
## lat long city_pop job dob
## 1 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138 1939 Patent attorney 1967-01-12
## 5 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28
## 6 40.3750 -75.2045 2158 Transport planner 1961-06-19
## merch_lat merch_long is_fraud trans_date age_at_trans
## 1 36.01129 -82.04832 0 2019-01-01 11255 days
## 2 49.15905 -118.18646 0 2019-01-01 14804 days
## 3 43.15070 -112.15448 0 2019-01-01 20801 days
## 4 47.03433 -112.56107 0 2019-01-01 18982 days
## 5 38.67500 -78.63246 0 2019-01-01 11967 days
## 6 40.65338 -76.15267 0 2019-01-01 21015 days
# Cardholder age at the time of the transaction, as a difftime in days.
# The unit is requested explicitly: plain subtraction lets difftime pick the
# unit from the data ("auto"), so the meaning of the number could change
# with the smallest difference present.
df_test1$age_at_trans <- difftime(
  df_test1$trans_date,
  df_test1$dob,
  units = "days"
)
head(df_test1)
## trans_date_trans_time category amt gender city state lat
## 1 2020-06-21 12:14:25 personal_care 2.86 M Columbia SC 33.9659
## 2 2020-06-21 12:14:33 personal_care 29.84 F Altonah UT 40.3207
## 3 2020-06-21 12:14:53 health_fitness 41.28 F Bellmore NY 40.6729
## 4 2020-06-21 12:15:15 misc_pos 60.05 M Titusville FL 28.5697
## 5 2020-06-21 12:15:17 travel 3.19 M Falmouth MI 44.2529
## 6 2020-06-21 12:15:37 kids_pets 19.55 F Breesport NY 42.1939
## long city_pop job dob merch_lat merch_long
## 1 -80.9355 333497 Mechanical engineer 1968-03-19 33.98639 -81.20071
## 2 -110.4360 302 Sales professional, IT 1990-01-17 39.45050 -109.96043
## 3 -73.5365 34496 Librarian, public 1970-10-21 40.49581 -74.19611
## 4 -80.8191 54767 Set designer 1987-07-25 28.81240 -80.88306
## 5 -85.0170 1126 Furniture designer 1955-07-06 44.95915 -85.88473
## 6 -76.7361 520 Psychotherapist 1991-10-13 41.74716 -77.58420
## is_fraud trans_date age_at_trans
## 1 0 2020-06-21 19087 days
## 2 0 2020-06-21 11113 days
## 3 0 2020-06-21 18141 days
## 4 0 2020-06-21 12020 days
## 5 0 2020-06-21 23727 days
## 6 0 2020-06-21 10479 days
# Re-inspect the test frame after adding age_at_trans.
utils::str(df_test1)
## 'data.frame': 555719 obs. of 16 variables:
## $ trans_date_trans_time: chr "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
## $ category : chr "personal_care" "personal_care" "health_fitness" "misc_pos" ...
## $ amt : num 2.86 29.84 41.28 60.05 3.19 ...
## $ gender : chr "M" "F" "F" "M" ...
## $ city : chr "Columbia" "Altonah" "Bellmore" "Titusville" ...
## $ state : chr "SC" "UT" "NY" "FL" ...
## $ lat : num 34 40.3 40.7 28.6 44.3 ...
## $ long : num -80.9 -110.4 -73.5 -80.8 -85 ...
## $ city_pop : int 333497 302 34496 54767 1126 520 1139 343 3688 263 ...
## $ job : chr "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
## $ dob : POSIXct, format: "1968-03-19" "1990-01-17" ...
## $ merch_lat : num 34 39.5 40.5 28.8 45 ...
## $ merch_long : num -81.2 -110 -74.2 -80.9 -85.9 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trans_date : POSIXct, format: "2020-06-21" "2020-06-21" ...
## $ age_at_trans : 'difftime' num 19087 11113 18141 12020 ...
## ..- attr(*, "units")= chr "days"
# Extract the two-digit month of each transaction as a character column.
df_train1[["trans_month"]] <- format(df_train1[["trans_date"]], format = "%m")
utils::head(df_train1, n = 6L)
## trans_date_trans_time category amt gender city state
## 1 2019-01-01 00:00:18 misc_net 4.97 F Moravian Falls NC
## 2 2019-01-01 00:00:44 grocery_pos 107.23 F Orient WA
## 3 2019-01-01 00:00:51 entertainment 220.11 M Malad City ID
## 4 2019-01-01 00:01:16 gas_transport 45.00 M Boulder MT
## 5 2019-01-01 00:03:06 misc_pos 41.96 M Doe Hill VA
## 6 2019-01-01 00:04:08 gas_transport 94.63 F Dublin PA
## lat long city_pop job dob
## 1 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09
## 2 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21
## 3 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19
## 4 46.2306 -112.1138 1939 Patent attorney 1967-01-12
## 5 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28
## 6 40.3750 -75.2045 2158 Transport planner 1961-06-19
## merch_lat merch_long is_fraud trans_date age_at_trans trans_month
## 1 36.01129 -82.04832 0 2019-01-01 11255 days 01
## 2 49.15905 -118.18646 0 2019-01-01 14804 days 01
## 3 43.15070 -112.15448 0 2019-01-01 20801 days 01
## 4 47.03433 -112.56107 0 2019-01-01 18982 days 01
## 5 38.67500 -78.63246 0 2019-01-01 11967 days 01
## 6 40.65338 -76.15267 0 2019-01-01 21015 days 01
# Extract the two-digit month of each transaction as a character column.
df_test1[["trans_month"]] <- format(df_test1[["trans_date"]], format = "%m")
utils::head(df_test1, n = 6L)
## trans_date_trans_time category amt gender city state lat
## 1 2020-06-21 12:14:25 personal_care 2.86 M Columbia SC 33.9659
## 2 2020-06-21 12:14:33 personal_care 29.84 F Altonah UT 40.3207
## 3 2020-06-21 12:14:53 health_fitness 41.28 F Bellmore NY 40.6729
## 4 2020-06-21 12:15:15 misc_pos 60.05 M Titusville FL 28.5697
## 5 2020-06-21 12:15:17 travel 3.19 M Falmouth MI 44.2529
## 6 2020-06-21 12:15:37 kids_pets 19.55 F Breesport NY 42.1939
## long city_pop job dob merch_lat merch_long
## 1 -80.9355 333497 Mechanical engineer 1968-03-19 33.98639 -81.20071
## 2 -110.4360 302 Sales professional, IT 1990-01-17 39.45050 -109.96043
## 3 -73.5365 34496 Librarian, public 1970-10-21 40.49581 -74.19611
## 4 -80.8191 54767 Set designer 1987-07-25 28.81240 -80.88306
## 5 -85.0170 1126 Furniture designer 1955-07-06 44.95915 -85.88473
## 6 -76.7361 520 Psychotherapist 1991-10-13 41.74716 -77.58420
## is_fraud trans_date age_at_trans trans_month
## 1 0 2020-06-21 19087 days 06
## 2 0 2020-06-21 11113 days 06
## 3 0 2020-06-21 18141 days 06
## 4 0 2020-06-21 12020 days 06
## 5 0 2020-06-21 23727 days 06
## 6 0 2020-06-21 10479 days 06
# Re-inspect the test frame after adding trans_month.
utils::str(df_test1)
## 'data.frame': 555719 obs. of 17 variables:
## $ trans_date_trans_time: chr "2020-06-21 12:14:25" "2020-06-21 12:14:33" "2020-06-21 12:14:53" "2020-06-21 12:15:15" ...
## $ category : chr "personal_care" "personal_care" "health_fitness" "misc_pos" ...
## $ amt : num 2.86 29.84 41.28 60.05 3.19 ...
## $ gender : chr "M" "F" "F" "M" ...
## $ city : chr "Columbia" "Altonah" "Bellmore" "Titusville" ...
## $ state : chr "SC" "UT" "NY" "FL" ...
## $ lat : num 34 40.3 40.7 28.6 44.3 ...
## $ long : num -80.9 -110.4 -73.5 -80.8 -85 ...
## $ city_pop : int 333497 302 34496 54767 1126 520 1139 343 3688 263 ...
## $ job : chr "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
## $ dob : POSIXct, format: "1968-03-19" "1990-01-17" ...
## $ merch_lat : num 34 39.5 40.5 28.8 45 ...
## $ merch_long : num -81.2 -110 -74.2 -80.9 -85.9 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trans_date : POSIXct, format: "2020-06-21" "2020-06-21" ...
## $ age_at_trans : 'difftime' num 19087 11113 18141 12020 ...
## ..- attr(*, "units")= chr "days"
## $ trans_month : chr "06" "06" "06" "06" ...
# Number of distinct values in each column of df_train1.
# Columns are visited one at a time with vapply() so each keeps its own type.
# apply() would first coerce the whole data frame to a character matrix,
# formatting numeric columns to a limited number of significant digits, which
# merges nearby floating-point values and undercounts columns such as
# merch_lat / merch_long. Counts for those columns can therefore be higher
# than a matrix-based count.
vapply(df_train1, function(column) length(unique(column)), integer(1))
## trans_date_trans_time category amt
## 1274791 14 52928
## gender city state
## 2 894 51
## lat long city_pop
## 968 969 879
## job dob merch_lat
## 494 968 917042
## merch_long is_fraud trans_date
## 1106872 2 537
## age_at_trans trans_month
## 30424 12
# Number of distinct values in each column of df_test1.
# Columns are visited one at a time with vapply() so each keeps its own type.
# apply() would first coerce the whole data frame to a character matrix,
# formatting numeric columns to a limited number of significant digits, which
# merges nearby floating-point values and undercounts columns such as
# merch_lat / merch_long. Counts for those columns can therefore be higher
# than a matrix-based count.
vapply(df_test1, function(column) length(unique(column)), integer(1))
## trans_date_trans_time category amt
## 544760 14 37256
## gender city state
## 2 849 50
## lat long city_pop
## 910 910 835
## job dob merch_lat
## 478 910 474151
## merch_long is_fraud trans_date
## 518539 2 194
## age_at_trans trans_month
## 27866 7
# Build the modelling frame df_train2: drop the raw timestamp, location,
# job and date columns, keeping the remaining columns in order.
df_train2 <- df_train1[
  ,
  !is.element(
    names(df_train1),
    c(
      "trans_date_trans_time", "city", "lat", "long", "job",
      "dob", "merch_lat", "merch_long", "trans_date"
    )
  )
]
utils::head(df_train2, n = 6L)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 misc_net 4.97 F NC 3495 0 11255 days 01
## 2 grocery_pos 107.23 F WA 149 0 14804 days 01
## 3 entertainment 220.11 M ID 4154 0 20801 days 01
## 4 gas_transport 45.00 M MT 1939 0 18982 days 01
## 5 misc_pos 41.96 M VA 99 0 11967 days 01
## 6 gas_transport 94.63 F PA 2158 0 21015 days 01
# Build the modelling frame df_test2: drop the raw timestamp, location,
# job and date columns, keeping the remaining columns in order.
df_test2 <- df_test1[
  ,
  !is.element(
    names(df_test1),
    c(
      "trans_date_trans_time", "city", "lat", "long", "job",
      "dob", "merch_lat", "merch_long", "trans_date"
    )
  )
]
utils::head(df_test2, n = 6L)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 personal_care 2.86 M SC 333497 0 19087 days 06
## 2 personal_care 29.84 F UT 302 0 11113 days 06
## 3 health_fitness 41.28 F NY 34496 0 18141 days 06
## 4 misc_pos 60.05 M FL 54767 0 12020 days 06
## 5 travel 3.19 M MI 1126 0 23727 days 06
## 6 kids_pets 19.55 F NY 520 0 10479 days 06
# Recode gender as a numeric indicator: 1 where the value is "M", 0 otherwise.
df_train2$gender <- as.numeric(df_train2$gender == "M")
utils::head(df_train2, n = 6L)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 misc_net 4.97 0 NC 3495 0 11255 days 01
## 2 grocery_pos 107.23 0 WA 149 0 14804 days 01
## 3 entertainment 220.11 1 ID 4154 0 20801 days 01
## 4 gas_transport 45.00 1 MT 1939 0 18982 days 01
## 5 misc_pos 41.96 1 VA 99 0 11967 days 01
## 6 gas_transport 94.63 0 PA 2158 0 21015 days 01
# Recode gender as a numeric indicator: 1 where the value is "M", 0 otherwise.
df_test2$gender <- as.numeric(df_test2$gender == "M")
utils::head(df_test2, n = 6L)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 personal_care 2.86 1 SC 333497 0 19087 days 06
## 2 personal_care 29.84 0 UT 302 0 11113 days 06
## 3 health_fitness 41.28 0 NY 34496 0 18141 days 06
## 4 misc_pos 60.05 1 FL 54767 0 12020 days 06
## 5 travel 3.19 1 MI 1126 0 23727 days 06
## 6 kids_pets 19.55 0 NY 520 0 10479 days 06
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-column summary (counts, missing values, distinct values, quantiles)
# of the cleaned test frame, using Hmisc's describe().
Hmisc::describe(df_test2)
## df_test2
##
## 8 Variables 555719 Observations
## --------------------------------------------------------------------------------
## category
## n missing distinct
## 555719 0 14
##
## lowest : entertainment food_dining gas_transport grocery_net grocery_pos
## highest: misc_pos personal_care shopping_net shopping_pos travel
##
## entertainment (40104, 0.072), food_dining (39268, 0.071), gas_transport (56370,
## 0.101), grocery_net (19426, 0.035), grocery_pos (52553, 0.095), health_fitness
## (36674, 0.066), home (52345, 0.094), kids_pets (48692, 0.088), misc_net (27367,
## 0.049), misc_pos (34574, 0.062), personal_care (39327, 0.071), shopping_net
## (41779, 0.075), shopping_pos (49791, 0.090), travel (17449, 0.031)
## --------------------------------------------------------------------------------
## amt
## n missing distinct Info Mean Gmd .05 .10
## 555719 0 37256 1 69.39 80.26 2.43 4.08
## .25 .50 .75 .90 .95
## 9.63 47.29 83.01 135.55 193.05
##
## lowest : 1.00 1.01 1.02 1.03 1.04
## highest: 16339.26 16837.08 19364.91 21437.71 22768.11
## --------------------------------------------------------------------------------
## gender
## n missing distinct Info Sum Mean Gmd
## 555719 0 2 0.743 250833 0.4514 0.4953
##
## --------------------------------------------------------------------------------
## state
## n missing distinct
## 555719 0 50
##
## lowest : AK AL AR AZ CA, highest: VT WA WI WV WY
## --------------------------------------------------------------------------------
## city_pop
## n missing distinct Info Mean Gmd .05 .10
## 555719 0 835 1 88222 157874 139 260
## .25 .50 .75 .90 .95
## 741 2408 19685 186140 525713
##
## lowest : 23 37 43 46 47
## highest: 1577385 1595797 2383912 2504700 2906700
## --------------------------------------------------------------------------------
## is_fraud
## n missing distinct Info Sum Mean Gmd
## 555719 0 2 0.012 2145 0.00386 0.00769
##
## --------------------------------------------------------------------------------
## age_at_trans [days]
## n missing distinct
## 555719 0 27866
##
## lowest : 5622 days 5623 days 5624 days 5625 days 5626 days
## highest: 35121 days 35122 days 35123 days 35124 days 35126 days
## --------------------------------------------------------------------------------
## trans_month
## n missing distinct
## 555719 0 7
##
## lowest : 06 07 08 09 10, highest: 08 09 10 11 12
##
## Value 6 7 8 9 10 11 12
## Frequency 30058 85848 88759 69533 69348 72635 139538
## Proportion 0.054 0.154 0.160 0.125 0.125 0.131 0.251
## --------------------------------------------------------------------------------
#install.packages("fastDummies")
library(fastDummies)
# Append one 0/1 indicator column per `category` value to the training frame.
# The original `category` column is kept alongside the new indicators.
df_train3 <- fastDummies::dummy_cols(df_train2, select_columns = "category")
head(df_train3)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 misc_net 4.97 0 NC 3495 0 11255 days 01
## 2 grocery_pos 107.23 0 WA 149 0 14804 days 01
## 3 entertainment 220.11 1 ID 4154 0 20801 days 01
## 4 gas_transport 45.00 1 MT 1939 0 18982 days 01
## 5 misc_pos 41.96 1 VA 99 0 11967 days 01
## 6 gas_transport 94.63 0 PA 2158 0 21015 days 01
## category_entertainment category_food_dining category_gas_transport
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 1
## category_grocery_net category_grocery_pos category_health_fitness
## 1 0 0 0
## 2 0 1 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_home category_kids_pets category_misc_net category_misc_pos
## 1 0 0 1 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 1
## 6 0 0 0 0
## category_personal_care category_shopping_net category_shopping_pos
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_travel
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Append one 0/1 indicator column per `category` value to the test frame.
# The original `category` column is kept alongside the new indicators.
df_test3 <- fastDummies::dummy_cols(df_test2, select_columns = "category")
head(df_test3)
## category amt gender state city_pop is_fraud age_at_trans trans_month
## 1 personal_care 2.86 1 SC 333497 0 19087 days 06
## 2 personal_care 29.84 0 UT 302 0 11113 days 06
## 3 health_fitness 41.28 0 NY 34496 0 18141 days 06
## 4 misc_pos 60.05 1 FL 54767 0 12020 days 06
## 5 travel 3.19 1 MI 1126 0 23727 days 06
## 6 kids_pets 19.55 0 NY 520 0 10479 days 06
## category_entertainment category_food_dining category_gas_transport
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_grocery_net category_grocery_pos category_health_fitness
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_home category_kids_pets category_misc_net category_misc_pos
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 1
## 5 0 0 0 0
## 6 0 1 0 0
## category_personal_care category_shopping_net category_shopping_pos
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_travel
## 1 0
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
# Drop the `state` column from the training frame.
df_train4 <- df_train3[, names(df_train3) != "state"]
head(df_train4)
## category amt gender city_pop is_fraud age_at_trans trans_month
## 1 misc_net 4.97 0 3495 0 11255 days 01
## 2 grocery_pos 107.23 0 149 0 14804 days 01
## 3 entertainment 220.11 1 4154 0 20801 days 01
## 4 gas_transport 45.00 1 1939 0 18982 days 01
## 5 misc_pos 41.96 1 99 0 11967 days 01
## 6 gas_transport 94.63 0 2158 0 21015 days 01
## category_entertainment category_food_dining category_gas_transport
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 1
## category_grocery_net category_grocery_pos category_health_fitness
## 1 0 0 0
## 2 0 1 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_home category_kids_pets category_misc_net category_misc_pos
## 1 0 0 1 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 1
## 6 0 0 0 0
## category_personal_care category_shopping_net category_shopping_pos
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_travel
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Drop the `state` column from the test frame.
df_test4 <- df_test3[, names(df_test3) != "state"]
head(df_test4)
## category amt gender city_pop is_fraud age_at_trans trans_month
## 1 personal_care 2.86 1 333497 0 19087 days 06
## 2 personal_care 29.84 0 302 0 11113 days 06
## 3 health_fitness 41.28 0 34496 0 18141 days 06
## 4 misc_pos 60.05 1 54767 0 12020 days 06
## 5 travel 3.19 1 1126 0 23727 days 06
## 6 kids_pets 19.55 0 520 0 10479 days 06
## category_entertainment category_food_dining category_gas_transport
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_grocery_net category_grocery_pos category_health_fitness
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_home category_kids_pets category_misc_net category_misc_pos
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 1
## 5 0 0 0 0
## 6 0 1 0 0
## category_personal_care category_shopping_net category_shopping_pos
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## category_travel
## 1 0
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
# Split each frame into predictors (every column except `is_fraud`) and the
# `is_fraud` label vector.
X_train <- df_train4[, names(df_train4) != "is_fraud"]
y_train <- df_train4[["is_fraud"]]
X_test <- df_test4[, names(df_test4) != "is_fraud"]
y_test <- df_test4[["is_fraud"]]
#library(randomForest)
#rf = randomForest(x = X_train,
# y = y_train,n_estimators=10, max_depth=5, max_features=12, min_samples_leaf=100, random_state=100, oob_score=True)
#head(rf)
#install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC of a single feature against the fraud label.
# pROC::roc() takes the class labels as `response` and the score as
# `predictor`; the earlier call had the two swapped, so it measured how well
# the fraud label "predicts" the feature. Arguments are now named explicitly.
# Column 3 of X_train is `gender`, so it is selected by name rather than by
# position. The AUC recorded below came from the swapped call and will change
# on re-run.
# NOTE(review): confirm gender is the feature this check was meant to score.
roc1 <- roc(response = y_train, predictor = X_train[["gender"]])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve held in roc1, then show the AUC stored on the object.
plot(roc1)
print(roc1[["auc"]])
## Area under the curve: 0.5006
library(rpart)
# Baseline decision tree trained on every column of df_train4.
# is_fraud is numeric, so rpart fits a regression ("anova") tree and
# predict() returns one numeric score per row, which is what roc() needs.
orig_fit <- rpart(is_fraud ~ ., data = df_train4)
# predict.rpart() has no `method` argument (the selector is `type`), so the
# former `method = "class"` was silently ignored; it is removed and the
# default numeric prediction is kept.
pred_orig <- predict(orig_fit, newdata = df_test4)
# pROC::roc() draws the curve with `plot = TRUE`; the former `plotit = TRUE`
# is not a pROC argument and was ignored, so no curve was produced.
roc(response = df_test4$is_fraud, predictor = pred_orig, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = df_test4$is_fraud, predictor = pred_orig, plotit = TRUE)
##
## Data: pred_orig in 553574 controls (df_test4$is_fraud 0) < 2145 cases (df_test4$is_fraud 1).
## Area under the curve: 0.9281
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Descriptive statistics for each column of the training frame;
# non-numeric columns are reported as NA.
pastecs::stat.desc(df_train4)
## category amt gender city_pop is_fraud
## nbr.val NA 1.296675e+06 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null NA 0.000000e+00 7.098630e+05 0.000000e+00 1.289169e+06
## nbr.na NA 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min NA 1.000000e+00 0.000000e+00 2.300000e+01 0.000000e+00
## max NA 2.894890e+04 1.000000e+00 2.906700e+06 1.000000e+00
## range NA 2.894790e+04 1.000000e+00 2.906677e+06 1.000000e+00
## sum NA 9.122243e+07 5.868120e+05 1.151764e+11 7.506000e+03
## median NA 4.752000e+01 0.000000e+00 2.456000e+03 0.000000e+00
## mean NA 7.035104e+01 4.525513e-01 8.882444e+04 5.788652e-03
## SE.mean NA 1.407866e-01 4.371095e-04 2.651726e+02 6.662123e-05
## CI.mean NA 2.759370e-01 8.567196e-04 5.197292e+02 1.305753e-04
## var NA 2.570123e+04 2.477488e-01 9.117764e+10 5.755148e-03
## std.dev NA 1.603160e+02 4.977437e-01 3.019564e+05 7.586269e-02
## coef.var NA 2.278801e+00 1.099861e+00 3.399474e+00 1.310542e+01
## age_at_trans trans_month category_entertainment category_food_dining
## nbr.val 1.296675e+06 NA 1.296675e+06 1.296675e+06
## nbr.null 0.000000e+00 NA 1.202661e+06 1.205214e+06
## nbr.na 0.000000e+00 NA 0.000000e+00 0.000000e+00
## min 5.085000e+03 NA 0.000000e+00 0.000000e+00
## max 3.493200e+04 NA 1.000000e+00 1.000000e+00
## range 2.984700e+04 NA 1.000000e+00 1.000000e+00
## sum 2.178438e+10 NA 9.401400e+04 9.146100e+04
## median 1.606000e+04 NA 0.000000e+00 0.000000e+00
## mean 1.680019e+04 NA 7.250390e-02 7.053502e-02
## SE.mean 5.579236e+00 NA 2.277306e-04 2.248555e-04
## CI.mean 1.093511e+01 NA 4.463441e-04 4.407091e-04
## var 4.036274e+07 NA 6.724714e-02 6.555988e-02
## std.dev 6.353168e+03 NA 2.593205e-01 2.560466e-01
## coef.var 3.781606e-01 NA 3.576642e+00 3.630064e+00
## category_gas_transport category_grocery_net category_grocery_pos
## nbr.val 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null 1.165016e+06 1.251223e+06 1.173037e+06
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 1.316590e+05 4.545200e+04 1.236380e+05
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 1.015359e-01 3.505273e-02 9.535003e-02
## SE.mean 2.652435e-04 1.615092e-04 2.579202e-04
## CI.mean 5.198681e-04 3.165525e-04 5.055147e-04
## var 9.122639e-02 3.382406e-02 8.625847e-02
## std.dev 3.020371e-01 1.839132e-01 2.936979e-01
## coef.var 2.974684e+00 5.246758e+00 3.080208e+00
## category_health_fitness category_home category_kids_pets
## nbr.val 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null 1.210796e+06 1.173560e+06 1.183640e+06
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 8.587900e+04 1.231150e+05 1.130350e+05
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 6.623017e-02 9.494669e-02 8.717296e-02
## SE.mean 2.183898e-04 2.574314e-04 2.477249e-04
## CI.mean 4.280366e-04 5.045568e-04 4.855324e-04
## var 6.184378e-02 8.593188e-02 7.957390e-02
## std.dev 2.486841e-01 2.931414e-01 2.820885e-01
## coef.var 3.754846e+00 3.087432e+00 3.235963e+00
## category_misc_net category_misc_pos category_personal_care
## nbr.val 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null 1.233388e+06 1.217020e+06 1.205917e+06
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 6.328700e+04 7.965500e+04 9.075800e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 4.880714e-02 6.143020e-02 6.999287e-02
## SE.mean 1.892172e-04 2.108671e-04 2.240550e-04
## CI.mean 3.708592e-04 4.132923e-04 4.391401e-04
## var 4.642504e-02 5.765657e-02 6.509392e-02
## std.dev 2.154647e-01 2.401178e-01 2.551351e-01
## coef.var 4.414614e+00 3.908791e+00 3.645159e+00
## category_shopping_net category_shopping_pos category_travel
## nbr.val 1.296675e+06 1.296675e+06 1.296675e+06
## nbr.null 1.199132e+06 1.180003e+06 1.256168e+06
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 9.754300e+04 1.166720e+05 4.050700e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 7.522548e-02 8.997783e-02 3.123913e-02
## SE.mean 2.316248e-04 2.512918e-04 1.527715e-04
## CI.mean 4.539766e-04 4.925234e-04 2.994269e-04
## var 6.956666e-02 8.188188e-02 3.026327e-02
## std.dev 2.637549e-01 2.861501e-01 1.739634e-01
## coef.var 3.506191e+00 3.180229e+00 5.568766e+00
# Descriptive statistics for each column of the test frame;
# non-numeric columns are reported as NA.
pastecs::stat.desc(df_test4)
## category amt gender city_pop is_fraud
## nbr.val NA 5.557190e+05 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null NA 0.000000e+00 3.048860e+05 0.000000e+00 5.535740e+05
## nbr.na NA 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min NA 1.000000e+00 0.000000e+00 2.300000e+01 0.000000e+00
## max NA 2.276811e+04 1.000000e+00 2.906700e+06 1.000000e+00
## range NA 2.276711e+04 1.000000e+00 2.906677e+06 1.000000e+00
## sum NA 3.856290e+07 2.508330e+05 4.902658e+10 2.145000e+03
## median NA 4.729000e+01 0.000000e+00 2.408000e+03 0.000000e+00
## mean NA 6.939281e+01 4.513666e-01 8.822189e+04 3.859864e-03
## SE.mean NA 2.102658e-01 6.675420e-04 4.029574e+02 8.318002e-05
## CI.mean NA 4.121143e-01 1.308361e-03 7.897837e+02 1.630302e-04
## var NA 2.456929e+04 2.476352e-01 9.023469e+10 3.844973e-03
## std.dev NA 1.567459e+02 4.976296e-01 3.003909e+05 6.200784e-02
## coef.var NA 2.258821e+00 1.102495e+00 3.404947e+00 1.606477e+01
## age_at_trans trans_month category_entertainment category_food_dining
## nbr.val 5.557190e+05 NA 5.557190e+05 5.557190e+05
## nbr.null 0.000000e+00 NA 5.156150e+05 5.164510e+05
## nbr.na 0.000000e+00 NA 0.000000e+00 0.000000e+00
## min 5.622000e+03 NA 0.000000e+00 0.000000e+00
## max 3.512600e+04 NA 1.000000e+00 1.000000e+00
## range 2.950400e+04 NA 1.000000e+00 1.000000e+00
## sum 9.517154e+09 NA 4.010400e+04 3.926800e+04
## median 1.640300e+04 NA 0.000000e+00 0.000000e+00
## mean 1.712584e+04 NA 7.216597e-02 7.066161e-02
## SE.mean 8.540633e+00 NA 3.471156e-04 3.437569e-04
## CI.mean 1.673937e+01 NA 6.803356e-04 6.737527e-04
## var 4.053549e+07 NA 6.695816e-02 6.566867e-02
## std.dev 6.366749e+03 NA 2.587628e-01 2.562590e-01
## coef.var 3.717628e-01 NA 3.585662e+00 3.626566e+00
## category_gas_transport category_grocery_net category_grocery_pos
## nbr.val 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null 4.993490e+05 5.362930e+05 5.031660e+05
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 5.637000e+04 1.942600e+04 5.255300e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 1.014362e-01 3.495652e-02 9.456758e-02
## SE.mean 4.049894e-04 2.463827e-04 3.925292e-04
## CI.mean 7.937663e-04 4.829023e-04 7.693447e-04
## var 9.114703e-02 3.373462e-02 8.562471e-02
## std.dev 3.019057e-01 1.836699e-01 2.926170e-01
## coef.var 2.976312e+00 5.254238e+00 3.094263e+00
## category_health_fitness category_home category_kids_pets
## nbr.val 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null 5.190450e+05 5.033740e+05 5.070270e+05
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 3.667400e+04 5.234500e+04 4.869200e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 6.599378e-02 9.419329e-02 8.761982e-02
## SE.mean 3.330421e-04 3.918326e-04 3.792817e-04
## CI.mean 6.527520e-04 7.679794e-04 7.433801e-04
## var 6.163872e-02 8.532107e-02 7.994273e-02
## std.dev 2.482715e-01 2.920977e-01 2.827415e-01
## coef.var 3.762043e+00 3.101046e+00 3.226912e+00
## category_misc_net category_misc_pos category_personal_care
## nbr.val 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null 5.283520e+05 5.211450e+05 5.163920e+05
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 2.736700e+04 3.457400e+04 3.932700e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 4.924611e-02 6.221490e-02 7.076778e-02
## SE.mean 2.902637e-04 3.240199e-04 3.439954e-04
## CI.mean 5.689077e-04 6.350686e-04 6.742201e-04
## var 4.682102e-02 5.834431e-02 6.575982e-02
## std.dev 2.163816e-01 2.415457e-01 2.564368e-01
## coef.var 4.393883e+00 3.882441e+00 3.623637e+00
## category_shopping_net category_shopping_pos category_travel
## nbr.val 5.557190e+05 5.557190e+05 5.557190e+05
## nbr.null 5.139400e+05 5.059280e+05 5.382700e+05
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00 0.000000e+00
## max 1.000000e+00 1.000000e+00 1.000000e+00
## range 1.000000e+00 1.000000e+00 1.000000e+00
## sum 4.177900e+04 4.979100e+04 1.744900e+04
## median 0.000000e+00 0.000000e+00 0.000000e+00
## mean 7.518008e-02 8.959744e-02 3.139896e-02
## SE.mean 3.537144e-04 3.831222e-04 2.339391e-04
## CI.mean 6.932690e-04 7.509073e-04 4.585133e-04
## var 6.952816e-02 8.156989e-02 3.041312e-02
## std.dev 2.636819e-01 2.856044e-01 1.743936e-01
## coef.var 3.507338e+00 3.187640e+00 5.554119e+00
# Collect the transaction identifiers and the true label from the raw training
# data. The `df_train$...` expressions are kept as-is because data.frame()
# derives the column names from them (df_train.amt, df_train.is_fraud, ...),
# and later code refers to those names.
df_train_merge <- data.frame(
  df_train$trans_date_trans_time,
  df_train$amt,
  df_train$trans_num,
  df_train$is_fraud
)
head(df_train_merge)
## df_train.trans_date_trans_time df_train.amt df_train.trans_num
## 1 2019-01-01 00:00:18 4.97 0b242abb623afc578575680df30655b9
## 2 2019-01-01 00:00:44 107.23 1f76529f8574734946361c461b024d99
## 3 2019-01-01 00:00:51 220.11 a1a22d70485983eac12b5b88dad1cf95
## 4 2019-01-01 00:01:16 45.00 6b849c168bdad6f867558c3793159a81
## 5 2019-01-01 00:03:06 41.96 a41d7549acf90789359a9aa5346dcb46
## 6 2019-01-01 00:04:08 94.63 189a841a0a8ba03058526bcfe566aab5
## df_train.is_fraud
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# Turn the tree's training-set scores into 0/1 fraud predictions using the
# 0.16 cut-off. The earlier code thresholded the true labels (y_train), which
# just reproduced is_fraud and made the predicted counts match the actual
# counts exactly. Scores now come from the fitted model `orig_fit`.
# unname() drops the row names that predict() attaches to its result.
# NOTE(review): assumes df_train4 rows are in the same order as df_train,
# since the result is column-bound to df_train_merge; the downstream counts
# recorded in this document will change once this is re-run.
df_train_pred <- ifelse(
  unname(predict(orig_fit, newdata = df_train4)) > 0.16,
  1,
  0
)
head(df_train_pred)
## [1] 0 0 0 0 0 0
# Attach the predicted label as an extra column next to the transaction
# details; cbind() names the new column after the variable, `df_train_pred`.
df_train_final <- cbind(df_train_merge, df_train_pred)
head(df_train_final)
## df_train.trans_date_trans_time df_train.amt df_train.trans_num
## 1 2019-01-01 00:00:18 4.97 0b242abb623afc578575680df30655b9
## 2 2019-01-01 00:00:44 107.23 1f76529f8574734946361c461b024d99
## 3 2019-01-01 00:00:51 220.11 a1a22d70485983eac12b5b88dad1cf95
## 4 2019-01-01 00:01:16 45.00 6b849c168bdad6f867558c3793159a81
## 5 2019-01-01 00:03:06 41.96 a41d7549acf90789359a9aa5346dcb46
## 6 2019-01-01 00:04:08 94.63 189a841a0a8ba03058526bcfe566aab5
## df_train.is_fraud df_train_pred
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# Rename the prediction column from `df_train_pred` to `is_fraud_pred`.
names(df_train_final)[names(df_train_final) %in% "df_train_pred"] <-
  "is_fraud_pred"
head(df_train_final)
## df_train.trans_date_trans_time df_train.amt df_train.trans_num
## 1 2019-01-01 00:00:18 4.97 0b242abb623afc578575680df30655b9
## 2 2019-01-01 00:00:44 107.23 1f76529f8574734946361c461b024d99
## 3 2019-01-01 00:00:51 220.11 a1a22d70485983eac12b5b88dad1cf95
## 4 2019-01-01 00:01:16 45.00 6b849c168bdad6f867558c3793159a81
## 5 2019-01-01 00:03:06 41.96 a41d7549acf90789359a9aa5346dcb46
## 6 2019-01-01 00:04:08 94.63 189a841a0a8ba03058526bcfe566aab5
## df_train.is_fraud is_fraud_pred
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
library(dplyr)
# Number of training rows per actual fraud label.
dplyr::count(df_train_final, df_train.is_fraud)
## df_train.is_fraud n
## 1 0 1289169
## 2 1 7506
# Number of training rows per predicted fraud label.
dplyr::count(df_train_final, is_fraud_pred)
## is_fraud_pred n
## 1 0 1289169
## 2 1 7506
# Stack the training and test frames row-wise into a single frame.
fraud <- rbind(df_train, df_test)
head(fraud)
## X trans_date_trans_time cc_num merchant
## 1 0 2019-01-01 00:00:18 2.703186e+15 fraud_Rippin, Kub and Mann
## 2 1 2019-01-01 00:00:44 6.304233e+11 fraud_Heller, Gutmann and Zieme
## 3 2 2019-01-01 00:00:51 3.885949e+13 fraud_Lind-Buckridge
## 4 3 2019-01-01 00:01:16 3.534094e+15 fraud_Kutch, Hermiston and Farrell
## 5 4 2019-01-01 00:03:06 3.755342e+14 fraud_Keeling-Crist
## 6 5 2019-01-01 00:04:08 4.767265e+15 fraud_Stroman, Hudson and Erdman
## category amt first last gender street
## 1 misc_net 4.97 Jennifer Banks F 561 Perry Cove
## 2 grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393
## 3 entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530
## 4 gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038
## 5 misc_pos 41.96 Tyler Garcia M 408 Bradley Rest
## 6 gas_transport 94.63 Jennifer Conner F 4655 David Island
## city state zip lat long city_pop
## 1 Moravian Falls NC 28654 36.0788 -81.1781 3495
## 2 Orient WA 99160 48.8878 -118.2105 149
## 3 Malad City ID 83252 42.1808 -112.2620 4154
## 4 Boulder MT 59632 46.2306 -112.1138 1939
## 5 Doe Hill VA 24433 38.4207 -79.4629 99
## 6 Dublin PA 18917 40.3750 -75.2045 2158
## job dob trans_num
## 1 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9
## 2 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99
## 3 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95
## 4 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81
## 5 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46
## 6 Transport planner 1961-06-19 189a841a0a8ba03058526bcfe566aab5
## unix_time merch_lat merch_long is_fraud trans_date
## 1 1325376018 36.01129 -82.04832 0 2019-01-01
## 2 1325376044 49.15905 -118.18646 0 2019-01-01
## 3 1325376051 43.15070 -112.15448 0 2019-01-01
## 4 1325376076 47.03433 -112.56107 0 2019-01-01
## 5 1325376186 38.67500 -78.63246 0 2019-01-01
## 6 1325376248 40.65338 -76.15267 0 2019-01-01
# Mean number of transactions per month over the combined data.
# NOTE(review): 24 is presumably the number of months covered - confirm.
avg_transactions_pm <- nrow(fraud) / 24
print(avg_transactions_pm)
## [1] 77183.08
The average number of transactions per month is 77183.
# Mean number of fraudulent transactions per month: count the rows flagged
# is_fraud == 1 and divide by the same 24-month divisor used above.
avg_fraudtrans_pm1 <- sum(fraud$is_fraud == 1) / 24
print(avg_fraudtrans_pm1)
## [1] 402.125
The average number of fraudulent transactions per month is 402.
# Mean amount of a fraudulent transaction: total fraudulent amount divided by
# the number of fraudulent rows.
avg_fraud_amt <- with(fraud, sum(amt[which(is_fraud == 1)]) / sum(is_fraud == 1))
print(avg_fraud_amt)
## [1] 530.6614
The average amount per fraudulent transaction is 530.66.
# Monthly fraud cost with no model in place: fraudulent transactions per month
# multiplied by the mean fraudulent amount.
cost_before_model <- avg_fraudtrans_pm1 * avg_fraud_amt
print(cost_before_model)
## [1] 213392.2
The monthly cost incurred, based on the first point described above, is 213392.2.