One of the most popular predictive methods in machine learning is Random Forest. It is an excellent tool for making predictions, but with flexibility comes a lack of interpretability.
## Loading required package: lattice
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
Read in the data
# Load the comma-separated file Car.txt. It has no header row, and "?"
# entries are read as missing values (NA).
car <- read.csv(
  file = "Car.txt",
  header = FALSE,
  sep = ",",
  na.strings = "?"
)

# Attach descriptive column names, in file order.
# NOTE(review): "compression_ration" looks like a typo for
# "compression_ratio"; kept as-is in case other code refers to this name.
names(car) <- c(
  "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
  "num_of_doors", "body_style", "drive_wheels", "engine_location",
  "wheel_base", "length", "width", "height", "curb_weight", "engine_type",
  "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
  "compression_ration", "horsepower", "peak_rpm", "city_mpg", "highway_mpg",
  "price"
)

# Keep only the rows that have no missing value in any column.
car <- car[complete.cases(car), , drop = FALSE]
# Fix the RNG seed: both the train/test partition and the bootstrap
# resampling used while tuning the forest are random, so without a seed the
# printed results differ on every run.
set.seed(1234)

# Put a proportion p = 0.7 of the rows (partitioned on fuel_type) into the
# training set; the remaining rows form the test set.
inTrain <- createDataPartition(y = car$fuel_type, p = 0.7, list = FALSE)
training <- car[inTrain, ]
testing <- car[-inTrain, ]

# Fit a random forest (method = "rf") predicting fuel_type from every other
# column. `proximity` is forwarded to the underlying fit; it is spelled out
# in full rather than relying on partial matching of `prox`.
modFit3 <- train(
  fuel_type ~ .,
  data = training,
  method = "rf",
  proximity = TRUE
)

# Print the resampling summary and the selected tuning parameter.
modFit3
## Random Forest
##
## 112 samples
## 25 predictor
## 2 classes: 'diesel', 'gas'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 112, 112, 112, 112, 112, 112, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9465533 0.6163742 0.02913985 0.2121647
## 33 1.0000000 1.0000000 0.00000000 0.0000000
## 65 1.0000000 1.0000000 0.00000000 0.0000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 33.
# Predict fuel_type for the held-out test rows using the fitted model.
pred <- predict(modFit3, newdata = testing)

# Record, for each test row, whether the prediction matches the actual value.
testing[["predRight"]] <- pred == testing[["fuel_type"]]

# Cross-tabulate predicted (rows) against actual (columns) fuel types.
table(pred, testing$fuel_type)
##
## pred diesel gas
## diesel 4 0
## gas 0 43
# Plot price against fuel type for the test rows, coloured by whether the
# prediction was correct. The x axis shows fuel_type, so it is labelled
# "Fuel Type" (the previous label, "Make", named a different column).
qplot(
  fuel_type, price,
  colour = predRight,
  data = testing,
  xlab = "Fuel Type",
  ylab = "Price",
  main = "Prediction Success"
)