Prepare a model on the fraud data to estimate the probability of Risky vs. Good. Risky individuals are those with Taxable Income <= 30000.

# Install only the packages this script needs that are not already present.
# Reinstalling all of them unconditionally on every run is slow and needs
# network access; the repository is also reached over HTTPS rather than HTTP.
required_packages <- c(
  "C50", "tree", "caret", "gmodels", "party", "knitr", "png"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cran.us.r-project.org")
}

library(party)

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(C50)
library(tree)
library(gmodels)
library(knitr)
library(png)

# Load the fraud data set; the CSV file is picked interactively.
# stringsAsFactors = TRUE makes any text columns come in as factors regardless
# of the R version (the default changed to FALSE in R 4.0); party::ctree()
# needs factors, not character vectors, for categorical variables.
FraudCheck <- read.csv(file.choose(), stringsAsFactors = TRUE)

# Distribution of the variable the class label is derived from.
hist(FraudCheck$Taxable.Income)

# Class label: "Risky" when Taxable.Income <= 30000, otherwise "Good".
# Built explicitly as a factor so the response is categorical on every R
# version instead of depending on data.frame()'s string conversion.
Risky_Good <- factor(
  ifelse(FraudCheck$Taxable.Income <= 30000, "Risky", "Good"),
  levels = c("Good", "Risky")
)
FC <- data.frame(FraudCheck, Risky_Good)

# Splitting data into training and testing: first 300 rows for training,
# rows 301-600 for testing. The split is positional (not randomised), so it
# relies on the file's row order; fail fast if the file is too short, because
# indexing past the last row would silently produce all-NA rows.
stopifnot(nrow(FC) >= 600L)
FC_train <- FC[1:300, ]
FC_test <- FC[301:600, ]

### Conditional inference tree (party::ctree) fitted on the full data set

# The plot goes to its own PNG file. Taxable.Income is deliberately left out
# of the formula because the label Risky_Good is derived from it.
png(file = "decision_tree_all.png")
opall_tree <- ctree(Risky_Good ~ Undergrad + Marital.Status + City.Population +
                      Work.Experience + Urban, data = FC)
summary(opall_tree)

##     Length      Class       Mode 
##          1 BinaryTree         S4

plot(opall_tree)
# Close the device so the PNG is actually flushed to disk and the graphics
# device is not left open for later plots.
invisible(dev.off())
# From the above tree, it looks like the data has about 20 % Risky
# individuals and 80 % Good individuals.


# Fit the same tree on the training rows only and save its plot.

png(file = "decision_tree.png")
op_tree <- ctree(Risky_Good ~ Undergrad + Marital.Status + City.Population +
                   Work.Experience + Urban, data = FC_train)
summary(op_tree)

##     Length      Class       Mode 
##          1 BinaryTree         S4

plot(op_tree)
# Close the device so the PNG is written out and no device is left open.
invisible(dev.off())

# Predicted classes for the held-out rows, as a data frame and as a factor.
pred_tree <- as.data.frame(predict(op_tree, newdata = FC_test))
pred_test_df <- predict(op_tree, newdata = FC_test)

# Share of test rows whose predicted class equals the actual class.
mean(pred_test_df == FC_test$Risky_Good) # Accuracy = 82 %

## [1] 0.82

# Cross-tabulate the actual classes (rows) against the predicted classes
# (columns) for the test rows.
CrossTable(x = FC_test$Risky_Good, y = pred_test_df)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  300 
## 
##  
##                    | pred_test_df 
## FC_test$Risky_Good |      Good | Row Total | 
## -------------------|-----------|-----------|
##               Good |       246 |       246 | 
##                    |     0.820 |           | 
## -------------------|-----------|-----------|
##              Risky |        54 |        54 | 
##                    |     0.180 |           | 
## -------------------|-----------|-----------|
##       Column Total |       300 |       300 | 
## -------------------|-----------|-----------|
## 
##

# caret::confusionMatrix() expects the predicted classes as `data` and the
# true classes as `reference`. Passing them the other way round transposes
# the table and computes sensitivity, specificity and prevalence against the
# predictions instead of the truth.
# NOTE: the recorded output that follows was produced with the two arguments
# swapped, so its "Prediction"/"Reference" axes are transposed; re-run to
# refresh it.
confusionMatrix(data = pred_test_df, reference = FC_test$Risky_Good)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Risky
##      Good   246     0
##      Risky   54     0
##                                           
##                Accuracy : 0.82            
##                  95% CI : (0.7718, 0.8618)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 5.498e-13       
##                                           
##             Sensitivity : 0.82            
##             Specificity :   NA            
##          Pos Pred Value :   NA            
##          Neg Pred Value :   NA            
##              Prevalence : 1.00            
##          Detection Rate : 0.82            
##    Detection Prevalence : 0.82            
##       Balanced Accuracy :   NA            
##                                           
##        'Positive' Class : Good            
##