The data pre-processing and visualization herein are done with the help of the R programming language and Tableau. First, I will process the non-dummy variables in Section 1 and then the dummy variables in Section 2.
Tableau Link to my Profile for interactive Visualization: https://public.tableau.com/app/profile/emmanuel.m.maseruka
distance_from_home - the distance from home to where the transaction happened.
distance_from_last_transaction - the distance from where the last transaction happened.
ratio_to_median_purchase_price - ratio of the transaction's purchase price to the median purchase price.
1) STEP 1
Pulling my Credit card Fraud Data set
# Load the credit card fraud data set from the local CSV export. The first row
# of the file supplies the column names. read.csv() already returns a data
# frame; the as.data.frame() wrapper is kept so the result is unchanged.
# NOTE(review): the path is an absolute, machine-specific Windows path, so the
# script only runs where this exact file location exists.
data <- as.data.frame(
  read.csv(
    "C:\\Users\\Kevin Meng\\OneDrive\\Desktop\\1.Credit card Fraud.csv",
    header = TRUE
  )
)
2) STEP 2
Normalization of Non Dummy Attributes
# Min-max normalization of the continuous (non-dummy) attributes.
#
# min_max_normalize() rescales a numeric vector linearly so that its smallest
# value maps to 0 and its largest value maps to 1.
#   values: numeric vector to rescale.
#   returns: numeric vector of the same length as `values`.
# Missing values are ignored when the bounds are computed and remain NA in the
# result, so a single NA no longer turns the whole column into NA.
# NOTE: if every value is identical the span is zero and the division yields
# NaN for each element.
min_max_normalize <- function(values) {
  bounds <- range(values, na.rm = TRUE) # c(minimum, maximum)
  (values - bounds[1]) / (bounds[2] - bounds[1])
}

# One helper call per attribute replaces the repeated formula and the scratch
# variable `range`, which masked the name of the base function range().
dist.home.norm <- min_max_normalize(data$distance_from_home) # normalized distance from home
dist.transac.norm <- min_max_normalize(data$distance_from_last_transaction) # normalized distance from last transaction
median2purchase.ratio.norm <- min_max_normalize(data$ratio_to_median_purchase_price) # normalized ratio_to_median_purchase_price
3) STEP 3
New Dataset with Normalized Attributes
# Build the new data set: the three normalized attributes followed by the four
# dummy attributes and the fraud label, bound together column-wise.
# cbind() only derives a column name automatically when an argument is a bare
# symbol, so the columns taken via `data$...` are named explicitly here;
# without the names those five columns would have empty column names.
new.data <- cbind(
  dist.home.norm,
  dist.transac.norm,
  median2purchase.ratio.norm,
  repeat_retailer = data$repeat_retailer,
  used_chip = data$used_chip,
  used_pin_number = data$used_pin_number,
  online_order = data$online_order,
  fraud = data$fraud
) # Creating new dataset with normalised attributes
4) STEP 4
Visualizing the Relationship between the Normalized Attributes
The Visualization below was produced by Tableau
Tableau Link to my Profile for interactive Visualization: https://public.tableau.com/app/profile/emmanuel.m.maseruka/viz/CreditCardFraud_16558266779080/Dashboard1
repeat_retailer - whether the transaction happened at the same retailer (dummy)
used_chip - whether the transaction was made through the chip (dummy)
used_pin_number - whether the transaction was made using a PIN number (dummy)
online_order - whether the transaction is an online order (dummy)
1) STEP 1
Subdividing Attributes into different groups
# Contingency table: fraud outcome (rows) versus repeat-retailer flag (columns).

# Fraudulent transactions, keeping only the repeat_retailer column.
# which() drops rows where the comparison is NA, as subset() does.
repeat_retailer.fraud1 <- data[which(data$fraud == 1), "repeat_retailer", drop = FALSE]
# Repeat retailers that were a fraud
repeat_retailer.1.fraud1 <- repeat_retailer.fraud1[
  which(repeat_retailer.fraud1$repeat_retailer == 1), ,
  drop = FALSE
]
# Non repeat retailers that were a fraud
repeat_retailer.0.fraud1 <- repeat_retailer.fraud1[
  which(repeat_retailer.fraud1$repeat_retailer == 0), ,
  drop = FALSE
]

# Legitimate transactions, keeping only the repeat_retailer column.
repeat_retailer.fraud0 <- data[which(data$fraud == 0), "repeat_retailer", drop = FALSE]
# Repeat retailers that were not a fraud
repeat_retailer.1.fraud0 <- repeat_retailer.fraud0[
  which(repeat_retailer.fraud0$repeat_retailer == 1), ,
  drop = FALSE
]
# Non repeat retailers that were not a fraud
repeat_retailer.0.fraud0 <- repeat_retailer.fraud0[
  which(repeat_retailer.fraud0$repeat_retailer == 0), ,
  drop = FALSE
]

# Row counts per cell, in the order: repeat, non repeat, row total.
fraud <- vapply(
  list(repeat_retailer.1.fraud1, repeat_retailer.0.fraud1, repeat_retailer.fraud1),
  nrow,
  integer(1)
)
no.fraud <- vapply(
  list(repeat_retailer.1.fraud0, repeat_retailer.0.fraud0, repeat_retailer.fraud0),
  nrow,
  integer(1)
)
# The totals row is the element-wise sum of the two count rows.
totals <- fraud + no.fraud

# Assemble the 3 x 3 table and label it for display.
x <- as.data.frame(rbind(fraud, no.fraud, totals))
names(x) <- c("Repeat Retailers", "Non Repeat Retailers", "Total")
rownames(x) <- c("Fraud", "No Fraud", "Total")

# Render as a striped, hover-highlighted kable table.
kable_material(kbl(x), c("striped", "hover"))
| | Repeat Retailers | Non Repeat Retailers | Total |
|---|---|---|---|
| Fraud | 76925 | 10478 | 87403 |
| No Fraud | 804611 | 107986 | 912597 |
| Total | 881536 | 118464 | 1000000 |
NA
# Contingency table: fraud outcome (rows) versus chip usage (columns).

# Fraudulent transactions, keeping only the used_chip column.
used_chip.fraud1 <- subset(data, fraud == 1, select = used_chip)
used_chip.1.fraud1 <- subset(used_chip.fraud1, used_chip == 1) # Used a chip, fraud
used_chip.0.fraud1 <- subset(used_chip.fraud1, used_chip == 0) # Did not use a chip, fraud

# Legitimate transactions, keeping only the used_chip column.
used_chip.fraud0 <- subset(data, fraud == 0, select = used_chip)
used_chip.1.fraud0 <- subset(used_chip.fraud0, used_chip == 1) # Used a chip, not fraud
used_chip.0.fraud0 <- subset(used_chip.fraud0, used_chip == 0) # Did not use a chip, not fraud

# Row counts per cell, in the order: chip used, chip not used, row total.
fraud2 <- c(nrow(used_chip.1.fraud1), nrow(used_chip.0.fraud1), nrow(used_chip.fraud1))
no.fraud2 <- c(nrow(used_chip.1.fraud0), nrow(used_chip.0.fraud0), nrow(used_chip.fraud0))
# The totals row is the element-wise sum of the two count rows.
totals1 <- fraud2 + no.fraud2

# Assemble the 3 x 3 table once (the earlier two-row rbind was overwritten
# immediately and has been removed) and label it for display.
y <- as.data.frame(rbind(fraud2, no.fraud2, totals1))
colnames(y) <- c("Used Chip", "No Chip", "Total")
row.names(y) <- c("Fraud", "No Fraud", "Total")

# Render as a striped, hover-highlighted kable table.
y %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Used Chip | No Chip | Total |
|---|---|---|---|
| Fraud | 22410 | 64993 | 87403 |
| No Fraud | 327989 | 584608 | 912597 |
| Total | 350399 | 649601 | 1000000 |
# Contingency table: fraud outcome (rows) versus PIN usage (columns).

# Fraudulent transactions, keeping only the used_pin_number column.
used_pin_number.fraud1 <- subset(data, fraud == 1, select = used_pin_number)
used_pin_number.1.fraud1 <- subset(used_pin_number.fraud1, used_pin_number == 1) # Used a PIN, fraud
used_pin_number.0.fraud1 <- subset(used_pin_number.fraud1, used_pin_number == 0) # Did not use a PIN, fraud

# Legitimate transactions, keeping only the used_pin_number column.
used_pin_number.fraud0 <- subset(data, fraud == 0, select = used_pin_number)
used_pin_number.1.fraud0 <- subset(used_pin_number.fraud0, used_pin_number == 1) # Used a PIN, not fraud
used_pin_number.0.fraud0 <- subset(used_pin_number.fraud0, used_pin_number == 0) # Did not use a PIN, not fraud

# Row counts per cell, in the order: PIN used, PIN not used, row total.
fraud3 <- c(
  nrow(used_pin_number.1.fraud1),
  nrow(used_pin_number.0.fraud1),
  nrow(used_pin_number.fraud1)
)
no.fraud3 <- c(
  nrow(used_pin_number.1.fraud0),
  nrow(used_pin_number.0.fraud0),
  nrow(used_pin_number.fraud0)
)
# The totals row is the element-wise sum of the two count rows.
totals2 <- fraud3 + no.fraud3

# Assemble the 3 x 3 table once (the earlier two-row rbind was overwritten
# immediately and has been removed) and label it for display.
z <- as.data.frame(rbind(fraud3, no.fraud3, totals2))
colnames(z) <- c("Used Pin", "No Pin", "Total")
row.names(z) <- c("Fraud", "No Fraud", "Total")

# Render as a striped, hover-highlighted kable table.
z %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Used Pin | No Pin | Total |
|---|---|---|---|
| Fraud | 273 | 87130 | 87403 |
| No Fraud | 100335 | 812262 | 912597 |
| Total | 100608 | 899392 | 1000000 |
# Contingency table: fraud outcome (rows) versus online/offline order (columns).

# Fraudulent transactions, keeping only the online_order column.
online_order.fraud1 <- subset(data, fraud == 1, select = online_order)
online_order.1.fraud1 <- subset(online_order.fraud1, online_order == 1) # Online order, fraud
online_order.0.fraud1 <- subset(online_order.fraud1, online_order == 0) # Offline order, fraud

# Legitimate transactions, keeping only the online_order column.
online_order.fraud0 <- subset(data, fraud == 0, select = online_order)
online_order.1.fraud0 <- subset(online_order.fraud0, online_order == 1) # Online order, not fraud
online_order.0.fraud0 <- subset(online_order.fraud0, online_order == 0) # Offline order, not fraud

# Row counts per cell, in the order: online, offline, row total.
fraud4 <- c(
  nrow(online_order.1.fraud1),
  nrow(online_order.0.fraud1),
  nrow(online_order.fraud1)
)
no.fraud4 <- c(
  nrow(online_order.1.fraud0),
  nrow(online_order.0.fraud0),
  nrow(online_order.fraud0)
)
# The totals row is the element-wise sum of the two count rows.
totals3 <- fraud4 + no.fraud4

# Assemble the 3 x 3 table once (the earlier two-row rbind was overwritten
# immediately and has been removed) and label it for display.
a <- as.data.frame(rbind(fraud4, no.fraud4, totals3))
colnames(a) <- c("Online Order", "Offline Order", "Total")
row.names(a) <- c("Fraud", "No Fraud", "Total")

# Render as a striped, hover-highlighted kable table.
a %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Online Order | Offline Order | Total |
|---|---|---|---|
| Fraud | 82711 | 4692 | 87403 |
| No Fraud | 567841 | 344756 | 912597 |
| Total | 650552 | 349448 | 1000000 |
2) STEP 2
Visualizing the Dummy Variables
The Visualization below was produced by Tableau
Tableau Link to my Profile for interactive Visualization: https://public.tableau.com/app/profile/emmanuel.m.maseruka/viz/CreditCardFraudDetectionVisualisationofDummyVariables2/Dashboard1
Tableau Link to my Profile for interactive Visualization: https://public.tableau.com/app/profile/emmanuel.m.maseruka/viz/CreditCardFraudDetectionVisualisationofDummyVariables1/Dashboard2