#install.packages("caret")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Location of the cement CSV on the local machine (set by hand).
filename <- "C:/Users/Lisa/OneDrive/CUNY/622/Week2 Linear_Logistic/cement.csv"
# Read the CSV (its first row holds the column names) and report its size.
dataset <- read.csv(file = filename, header = TRUE)
dim(dataset)
## [1] 279 4
# Encode Producer as a factor, keep only the three modelling columns, and
# keep only the rows where both Production and Sales are strictly positive.
dataset <- dataset %>%
  mutate(Producer = as.factor(Producer)) %>%
  select(Production, Sales, Producer) %>%
  filter(Production > 0 & Sales > 0)
# Hold out a stratified 20% of the rows for final validation.
# Seed the RNG first: createDataPartition() samples at random, so without a
# seed the train/validation split (and every downstream result) changes on
# each run.
set.seed(7)
# Row indices of the 80% training portion, stratified on Producer
# (list = FALSE returns them as a matrix instead of a list). Despite its
# name, validation_index holds the rows that are KEPT for training.
validation_index <- createDataPartition(dataset$Producer, p = 0.80, list = FALSE)
# the rows that were not selected (about 20%) form the validation set
validation <- dataset[-validation_index, ]
# use the selected 80% of the data for training and testing the models
dataset <- dataset[validation_index, ]
# Size and column names of the training portion
dim(dataset)
## [1] 142 3
names(dataset)
## [1] "Production" "Sales" "Producer"
# Make sure the class column is a factor, then list its levels
dataset$Producer <- as.factor(dataset$Producer)
levels(dataset$Producer)
## [1] "A-EAST" "A-NORTH" "A-SOUTH" "A-WEST"
# Class distribution: absolute counts next to their share in percent
class_counts <- table(dataset$Producer)
percentage <- prop.table(class_counts) * 100
cbind(freq = class_counts, percentage = percentage)
## freq percentage
## A-EAST 32 22.53521
## A-NORTH 40 28.16901
## A-SOUTH 32 22.53521
## A-WEST 38 26.76056
# Per-column summary statistics
summary(dataset)
## Production Sales Producer
## Min. : 377215 Min. : 451312 A-EAST :32
## 1st Qu.: 964758 1st Qu.:1077420 A-NORTH:40
## Median :1164372 Median :1348242 A-SOUTH:32
## Mean :1350529 Mean :1530325 A-WEST :38
## 3rd Qu.:1621072 3rd Qu.:1730427
## Max. :3242360 Max. :3535506
# split input and output: the two numeric predictors (x) and the class label (y).
# Columns are picked by name so the split does not depend on column order.
x <- dataset[, c("Production", "Sales")]
y <- dataset[, "Producer"]
# One boxplot per predictor, side by side on a single image.
# par() returns the previous settings, so they can be restored afterwards;
# otherwise the 1 x 2 layout leaks into every later base-graphics plot.
# NOTE(review): the original comment flagged an "ERROR" here without detail;
# presumably a "figure margins too large" failure on a small plot device -
# confirm.
old_par <- par(mfrow = c(1, ncol(x)))
for (i in seq_len(ncol(x))) {
  # title each panel with its column name so the two plots can be told apart
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)
# barplot for class breakdown (plot() on a factor draws one bar per level)
plot(y)
# Scatterplot matrix of the predictors with one ellipse per class
featurePlot(x, y, plot = "ellipse")
# Box-and-whisker plot of every predictor, split by class
featurePlot(x, y, plot = "box")
# Per-class density curves; "free" lets every panel pick its own axis ranges
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x, y, plot = "density", scales = scales)
# Run algorithms using 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Fit one caret model of the given `method` on the training data, predicting
# Producer from all other columns.
# The RNG is re-seeded with the same value before every fit so that each
# model starts from an identical random state; this keeps the runs
# repeatable and is intended to give every model the same CV folds for the
# resamples() comparison.
fit_model <- function(method, seed = 7) {
  set.seed(seed)
  train(Producer ~ ., data = dataset, method = method, metric = metric,
        trControl = control)
}

# a) linear algorithms
fit.lda <- fit_model("lda")
# b) nonlinear algorithms
# CART
fit.cart <- fit_model("rpart")
# kNN
fit.knn <- fit_model("knn")
# c) advanced algorithms
# SVM
fit.svm <- fit_model("svmRadial")
# Random Forest
fit.rf <- fit_model("rf")
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
# summarize accuracy of models: collect the five fits under short labels and
# gather their cross-validation results into one object
fitted_models <- list(
  lda = fit.lda,
  cart = fit.cart,
  knn = fit.knn,
  svm = fit.svm,
  rf = fit.rf
)
results <- resamples(fitted_models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.1333333 0.3392857 0.3571429 0.3603663 0.4175824 0.5000000 0
## cart 0.3333333 0.3884615 0.4285714 0.4222711 0.4285714 0.5333333 0
## knn 0.3076923 0.3750000 0.4285714 0.4283883 0.4916667 0.5333333 0
## svm 0.2142857 0.4000000 0.4285714 0.4013919 0.4285714 0.4666667 0
## rf 0.2857143 0.4071429 0.4285714 0.4283883 0.4666667 0.5714286 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda -0.17469880 0.1179556 0.1279337 0.1272347 0.1928732 0.3146853 0
## cart 0.09090909 0.1800595 0.2222222 0.2226022 0.2393290 0.3750000 0
## knn 0.07142857 0.1596363 0.2354860 0.2321355 0.3110119 0.3750000 0
## svm -0.04054054 0.1928197 0.2221847 0.1947038 0.2315541 0.2857143 0
## rf 0.04109589 0.2042180 0.2406692 0.2342918 0.2867327 0.4246575 0
# compare accuracy of models in a dot plot
dotplot(results)
# Show the resampling summary of the model picked as best (the random forest)
print(fit.rf)
## Random Forest
##
## 142 samples
## 2 predictor
## 4 classes: 'A-EAST', 'A-NORTH', 'A-SOUTH', 'A-WEST'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 127, 127, 127, 129, 128, 128, ...
## Resampling results:
##
## Accuracy Kappa
## 0.4283883 0.2342918
##
## Tuning parameter 'mtry' was held constant at a value of 2
# Estimate the skill of that random forest on the held-out validation rows:
# predict their classes, then cross-tabulate predictions against the truth
predictions <- predict(fit.rf, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Producer)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A-EAST A-NORTH A-SOUTH A-WEST
## A-EAST 6 1 3 3
## A-NORTH 1 3 3 3
## A-SOUTH 0 5 2 0
## A-WEST 1 1 0 3
##
## Overall Statistics
##
## Accuracy : 0.4
## 95% CI : (0.2387, 0.5789)
## No Information Rate : 0.2857
## P-Value [Acc > NIR] : 0.09766
##
## Kappa : 0.2011
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A-EAST Class: A-NORTH Class: A-SOUTH Class: A-WEST
## Sensitivity 0.7500 0.30000 0.25000 0.33333
## Specificity 0.7407 0.72000 0.81481 0.92308
## Pos Pred Value 0.4615 0.30000 0.28571 0.60000
## Neg Pred Value 0.9091 0.72000 0.78571 0.80000
## Prevalence 0.2286 0.28571 0.22857 0.25714
## Detection Rate 0.1714 0.08571 0.05714 0.08571
## Detection Prevalence 0.3714 0.28571 0.20000 0.14286
## Balanced Accuracy 0.7454 0.51000 0.53241 0.62821
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Template example chunk: print per-column summary statistics of `cars`
# (its two columns, speed and dist, appear in the echoed output below).
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.