Prepare a model on the fraud data to estimate the probability of Risky vs. Good. Risky individuals are those with Taxable Income <= 30000.
# Install only the packages that are not yet available, rather than
# re-downloading and re-installing every package each time the script runs.
required_pkgs <- c("C50", "tree", "caret", "gmodels", "party", "knitr", "png")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  # Use an HTTPS CRAN mirror so packages are not fetched over plain HTTP.
  install.packages(missing_pkgs, repos = "https://cloud.r-project.org")
}
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(C50)
library(tree)
library(gmodels)
library(knitr)
library(png)
# Load the fraud data set chosen interactively by the user.
# stringsAsFactors = TRUE keeps the categorical predictors (Undergrad,
# Marital.Status, Urban) as factors on R >= 4.0, where the default changed;
# party::ctree() needs factors, not character columns.
FraudCheck <- read.csv(file.choose(), stringsAsFactors = TRUE)

# Inspect the distribution of the variable used to derive the class label.
hist(FraudCheck$Taxable.Income)

# Label each record: Taxable.Income <= 30000 is "Risky", otherwise "Good".
# Stored explicitly as a factor so the response is categorical regardless of
# the R version's data.frame() defaults.
Risky_Good <- factor(
  ifelse(FraudCheck$Taxable.Income <= 30000, "Risky", "Good"),
  levels = c("Good", "Risky")
)
FC <- data.frame(FraudCheck, Risky_Good)

# Split the data into training and testing sets.
# NOTE(review): the split is positional (first 300 rows vs. rows 301-600),
# not random, and assumes the file has at least 600 rows -- confirm the rows
# are not ordered in a way that biases the split.
FC_train <- FC[1:300, ]
# View(FC_train)
FC_test <- FC[301:600, ]
# View(FC_test)
### Conditional inference tree (party::ctree) fitted on the full data set
opall_tree <- ctree(
  Risky_Good ~ Undergrad + Marital.Status + City.Population +
    Work.Experience + Urban,
  data = FC
)
summary(opall_tree)
## Length Class Mode
## 1 BinaryTree S4

# Write the plot to its own file and close the device afterwards. Without
# dev.off() the PNG is never finalised, and reusing the file name of the
# training-data plot would let one plot overwrite the other.
png(file = "decision_tree_full.png")
plot(opall_tree)
dev.off()
# From the above tree, it looks like the data has about 20 % Risky and
# 80 % Good records.
# Fit the same conditional inference tree using only the training data.
op_tree <- ctree(
  Risky_Good ~ Undergrad + Marital.Status + City.Population +
    Work.Experience + Urban,
  data = FC_train
)
summary(op_tree)
## Length Class Mode
## 1 BinaryTree S4

# Open the PNG device just for this plot and close it so the file is
# actually flushed to disk and the device is not leaked.
png(file = "decision_tree.png")
plot(op_tree)
dev.off()

# Predicted classes for the held-out rows, as a data frame and as a vector.
# (The former `pred_tree["final"] <- NULL` removed a column that never
# existed, so it was a no-op and has been dropped.)
pred_tree <- as.data.frame(predict(op_tree, newdata = FC_test))
pred_test_df <- predict(op_tree, newdata = FC_test)

# Share of test rows whose predicted class equals the actual class.
# The recorded 82 % equals the share of "Good" rows in the test set: the
# cross table below shows the model predicts "Good" for every row.
mean(pred_test_df == FC_test$Risky_Good) # Accuracy = 82 %
## [1] 0.82

# Cross-tabulate actual classes (rows) against predicted classes (columns).
CrossTable(FC_test$Risky_Good, pred_test_df)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 300
##
##
## | pred_test_df
## FC_test$Risky_Good | Good | Row Total |
## -------------------|-----------|-----------|
## Good | 246 | 246 |
## | 0.820 | |
## -------------------|-----------|-----------|
## Risky | 54 | 54 |
## | 0.180 | |
## -------------------|-----------|-----------|
## Column Total | 300 | 300 |
## -------------------|-----------|-----------|
##
##
# caret::confusionMatrix() expects the predictions as `data` and the true
# classes as `reference`. The original call passed them the other way round,
# so the truth was scored as if it were the prediction.
confusionMatrix(data = pred_test_df, reference = FC_test$Risky_Good)
# NOTE: the output recorded below predates this fix. It was produced with the
# arguments swapped, which is why "Reference" contains no Risky cases,
# Prevalence is 1.00 and Specificity is NA. Re-run the script to refresh it.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Good Risky
## Good 246 0
## Risky 54 0
##
## Accuracy : 0.82
## 95% CI : (0.7718, 0.8618)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : 5.498e-13
##
## Sensitivity : 0.82
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.00
## Detection Rate : 0.82
## Detection Prevalence : 0.82
## Balanced Accuracy : NA
##
## 'Positive' Class : Good
##