# Random Forest

# Company dataset

# Assignment 29

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

# Load the company sales data from the local CSV export.
company_csv <- "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\11 Random Forests\\Assignment\\Company_Data.csv"
mydata <- read.csv(file = company_csv)

# Show which columns were read.
names(mydata)

##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"

# Binary target: 1 when Sales is above the cutoff, 0 otherwise.
# NOTE(review): 7.490 looks like the median of Sales (the original split
# indices imply 199 high / 201 low rows) -- confirm against the data.
sales_cutoff <- 7.490
Sales_Result <- ifelse(mydata$Sales > sales_cutoff, 1, 0)
mydata[, "Sales_Result"] <- Sales_Result

# Categorical predictors and the target must be factors so that
# randomForest() treats the problem as classification.
factor_cols <- c("ShelveLoc", "Urban", "US", "Sales_Result")
mydata[factor_cols] <- lapply(mydata[factor_cols], as.factor)

sales_high <- mydata[mydata$Sales_Result == "1", ]
sales_low <- mydata[mydata$Sales_Result == "0", ]

# The first 150 rows of each class (in file order) train the model and every
# remaining row of that class is held out. head()/tail() are used instead of
# fixed ranges such as 151:199 so the split follows the actual class sizes and
# never indexes past the end of a class (which would create all-NA rows).
n_train_per_class <- 150
data_train <- rbind(
  head(sales_high, n_train_per_class),
  head(sales_low, n_train_per_class)
)
data_test <- rbind(
  tail(sales_high, -n_train_per_class),
  tail(sales_low, -n_train_per_class)
)

# Building a random forest model on training data.
# Sales is excluded from the predictors: Sales_Result is derived directly from
# Sales (Sales > 7.490), so leaving it in lets the forest recover the label
# from that one column and inflates every accuracy figure (target leakage).
# NOTE: the outputs recorded in this transcript were produced with Sales
# included as a predictor; re-run the script to refresh them.
fit.forest <- randomForest(Sales_Result ~ . - Sales, data = data_train,
                           na.action = na.roughfix, importance = TRUE)

# Training (resubstitution) accuracy: the forest is scored on the same rows it
# was fit on, so this number is optimistic. print(fit.forest) reports the
# out-of-bag error, which is a fairer estimate.
mean(data_train$Sales_Result == predict(fit.forest, data_train))

## [1] 1

# Class predictions for the training rows (the same rows the forest was fit on)
pred_train <- predict(object = fit.forest, newdata = data_train)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

# Confusion matrix on the training data.
# caret expects confusionMatrix(data = <predicted>, reference = <observed>);
# naming the arguments keeps the "Prediction" / "Reference" labels in the
# printed table truthful.
confusionMatrix(data = pred_train, reference = data_train$Sales_Result)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 150   0
##          1   0 150
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9878, 1)
##     No Information Rate : 0.5        
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0        
##             Specificity : 1.0        
##          Pos Pred Value : 1.0        
##          Neg Pred Value : 1.0        
##              Prevalence : 0.5        
##          Detection Rate : 0.5        
##    Detection Prevalence : 0.5        
##       Balanced Accuracy : 1.0        
##                                      
##        'Positive' Class : 0          
##

# Score the held-out rows with the fitted forest
pred_test <- predict(object = fit.forest, newdata = data_test)
# Test accuracy: fraction of held-out rows whose predicted class matches
mean(data_test$Sales_Result == pred_test)

## [1] 0.99

# Confusion matrix on the held-out test data.
# Predictions go in `data` and observed classes in `reference`; with the
# arguments swapped the table is transposed, which exchanges sensitivity with
# positive predictive value and specificity with negative predictive value.
confusionMatrix(data = pred_test, reference = data_test$Sales_Result)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 51  0
##          1  1 48
##                                           
##                Accuracy : 0.99            
##                  95% CI : (0.9455, 0.9997)
##     No Information Rate : 0.52            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.98            
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9808          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9796          
##              Prevalence : 0.5200          
##          Detection Rate : 0.5100          
##    Detection Prevalence : 0.5100          
##       Balanced Accuracy : 0.9904          
##                                           
##        'Positive' Class : 0               
##

# Visualization: error rate against number of trees, one curve per column of
# fit.forest$err.rate.
plot(fit.forest, lwd = 2)
# One legend swatch per plotted series, numbered 1, 2, ... in column order,
# rather than a hard-coded 1:4 that does not match the number of curves.
# NOTE(review): assumes the curves are drawn with colours 1, 2, ... in the
# same column order (matplot defaults) -- confirm against plot.randomForest.
err_series <- colnames(fit.forest$err.rate)
series_cols <- seq_along(err_series)
legend("topright", legend = err_series, col = series_cols, cex = 0.8,
       fill = series_cols)