Assignment 28: Random Forest on the Fraud_check data
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# Load the fraud-check data
mydata <- read.csv("C:\\Users\\RISHI RAHUL\\Desktop\\DS\\11 Random Forests\\Assignment\\Fraud_check.csv")
colnames(mydata)
## [1] "Undergrad" "Marital.Status" "Taxable.Income" "City.Population"
## [5] "Work.Experience" "Urban"
# Create the target variable: taxable income <= 30000 is flagged as risky (1), everything else as not risky (0)
mydata$TIRisky <- ifelse(mydata$Taxable.Income <= 30000, 1, 0)
# Convert the categorical predictors and the new target to factors
mydata$Undergrad <- as.factor(mydata$Undergrad)
mydata$Marital.Status <- as.factor(mydata$Marital.Status)
mydata$Urban <- as.factor(mydata$Urban)
mydata$TIRisky <- as.factor(mydata$TIRisky)
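# The two classes are quite unbalanced (far more non-risky than risky records), which is
# why the hold-out split below is built separately within each class. A quick check of
# the class balance, using only base R:
table(mydata$TIRisky)
prop.table(table(mydata$TIRisky))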
# Split by class so that both the training and the test set contain risky and non-risky cases
fraud_risky <- mydata[mydata$TIRisky == "1",]
fraud_not_risky <- mydata[mydata$TIRisky == "0",]
# Training set: first 93 risky + first 357 non-risky rows; test set: the remaining rows
# (note that fraud_not_risky row 357 falls in both slices; starting the test slice at 358 would avoid the one-row overlap)
data_train <- rbind(fraud_risky[1:93,], fraud_not_risky[1:357,])
data_test <- rbind(fraud_risky[94:124,], fraud_not_risky[357:476,])
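# The split above simply takes the first rows of each class, so it depends on the row
# order of the CSV file. A minimal alternative sketch of a randomised within-class split;
# the seed and the *_alt object names are illustrative and not part of the assignment:
set.seed(42)                                       # hypothetical seed, for reproducibility only
risky_idx <- sample(nrow(fraud_risky), size = 93)  # same class proportions as above
good_idx <- sample(nrow(fraud_not_risky), size = 357)
data_train_alt <- rbind(fraud_risky[risky_idx,], fraud_not_risky[good_idx,])
data_test_alt <- rbind(fraud_risky[-risky_idx,], fraud_not_risky[-good_idx,])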
# Building a random forest model on the training data
# (Taxable.Income, from which TIRisky was derived, is still among the predictors,
# so the forest can recover the <= 30000 rule and perfect accuracy is expected)
fit.forest <- randomForest(TIRisky ~ ., data = data_train, na.action = na.roughfix, importance = TRUE)
# Training accuracy
mean(data_train$TIRisky == predict(fit.forest,data_train)) # 100% accuracy
## [1] 1
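# Perfect accuracy on the data the forest was fitted to is expected and not very informative;
# the out-of-bag (OOB) error reported by the fitted object and the variable importances give
# a better picture. All three calls below come from the randomForest package already loaded:
print(fit.forest)       # ntree, mtry and the OOB estimate of the error rate
importance(fit.forest)  # MeanDecreaseAccuracy and MeanDecreaseGini per predictor
varImpPlot(fit.forest)  # the same importance measures as a plot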
# Predictions on the training data (used for the confusion matrix below)
pred_train <- predict(fit.forest,data_train)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Confusion matrix for the training predictions (data = predicted, reference = actual)
confusionMatrix(data = pred_train, reference = data_train$TIRisky)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 357 0
## 1 0 93
##
## Accuracy : 1
## 95% CI : (0.9918, 1)
## No Information Rate : 0.7933
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7933
## Detection Rate : 0.7933
## Detection Prevalence : 0.7933
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
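# confusionMatrix() returns an object, not just printed text, so individual statistics can
# also be pulled out programmatically; cm_train below is an illustrative name, not part of
# the original script:
cm_train <- confusionMatrix(data = pred_train, reference = data_train$TIRisky)
cm_train$table                                    # the 2 x 2 table itself
cm_train$overall["Accuracy"]                      # overall accuracy
cm_train$byClass[c("Sensitivity", "Specificity")] # per-class statistics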
# Predictions on the test data
pred_test <- predict(fit.forest, newdata = data_test)
# Test accuracy
mean(pred_test == data_test$TIRisky) # Accuracy = 100%
## [1] 1
# Confusion matrix for the test predictions
confusionMatrix(data = pred_test, reference = data_test$TIRisky)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 120 0
## 1 0 31
##
## Accuracy : 1
## 95% CI : (0.9759, 1)
## No Information Rate : 0.7947
## P-Value [Acc > NIR] : 8.528e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7947
## Detection Rate : 0.7947
## Detection Prevalence : 0.7947
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
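# Since TIRisky was defined directly from Taxable.Income (<= 30000) and Taxable.Income is
# still a predictor, the perfect scores on both sets largely reflect the forest recovering
# that threshold. As an optional check, a sketch of the same model with Taxable.Income
# dropped (fit.forest2 is an illustrative name) shows what the remaining predictors achieve:
fit.forest2 <- randomForest(TIRisky ~ . - Taxable.Income, data = data_train,
                            na.action = na.roughfix, importance = TRUE)
mean(predict(fit.forest2, newdata = data_test) == data_test$TIRisky) # test accuracy without the leaked variable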
# Visualization: error rate against the number of trees (OOB and per-class error)
plot(fit.forest, lwd = 2)
legend("topright", colnames(fit.forest$err.rate), col = 1:3, cex = 0.8, fill = 1:3)
