#install.packages("caret")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# location of the cement CSV (set by hand; points at a local directory)
filename <- "C:/Users/Lisa/OneDrive/CUNY/622/Week2 Linear_Logistic/cement.csv"
# read the CSV, treating its first row as column names
dataset <- read.csv(filename, header = TRUE)

dim(dataset)
## [1] 279   4
# In one pipeline: make Producer a factor, keep only the three columns used
# below, and drop rows whose Production or Sales is not strictly positive.
dataset <- dataset %>%
  mutate(Producer = as.factor(Producer)) %>%
  select(Production, Sales, Producer) %>%
  filter(Production > 0, Sales > 0)

# Hold out a validation set.
# createDataPartition() draws its rows at random, so seed the RNG first:
# without a seed the train/validation split -- and every accuracy figure
# computed from it -- changes on each run, even though the model fits
# themselves are seeded.
# NOTE(review): the `##` output lines recorded in this file appear to come from
# an unseeded run, so a rerun is not expected to reproduce them exactly.
set.seed(7)
# row indices for 80% of the data, sampled with respect to Producer
validation_index <- createDataPartition(dataset$Producer, p = 0.80, list = FALSE)
# the 20% of rows that were NOT sampled become the validation set
validation <- dataset[-validation_index, ]
# keep the sampled 80% for training and cross-validating the models
dataset <- dataset[validation_index, ]

# rows and columns left in the training portion
dim(dataset)
## [1] 142   3
# column names
names(dataset)
## [1] "Production" "Sales"      "Producer"
# make sure the class column is a factor
dataset$Producer <- as.factor(dataset$Producer)
# class labels
levels(dataset$Producer)
## [1] "A-EAST"  "A-NORTH" "A-SOUTH" "A-WEST"
# class distribution: row count per Producer next to its share in percent
percentage <- 100 * prop.table(table(dataset$Producer))
cbind(freq = table(dataset$Producer), percentage = percentage)
##         freq percentage
## A-EAST    32   22.53521
## A-NORTH   40   28.16901
## A-SOUTH   32   22.53521
## A-WEST    38   26.76056
# per-column summary (quartiles for the numeric columns, counts for the factor)
summary(dataset)
##    Production          Sales            Producer 
##  Min.   : 377215   Min.   : 451312   A-EAST :32  
##  1st Qu.: 964758   1st Qu.:1077420   A-NORTH:40  
##  Median :1164372   Median :1348242   A-SOUTH:32  
##  Mean   :1350529   Mean   :1530325   A-WEST :38  
##  3rd Qu.:1621072   3rd Qu.:1730427               
##  Max.   :3242360   Max.   :3535506
# split input (predictors) and output (class label)
# Columns are picked by name rather than by position, so a change in column
# order cannot silently swap a predictor with the class label.
x <- dataset[, c("Production", "Sales")]
y <- dataset$Producer
# boxplot for each attribute, side by side on one image
# par() returns the previous settings; keep them so the layout can be restored.
old_par <- par(mfrow = c(1, 2))
for (i in seq_along(x)) {
  # title each panel with its column name so the two boxplots can be told apart
  boxplot(x[[i]], main = names(x)[i])
}
# restore the previous layout; otherwise later base plots (such as the barplot
# below) would still be drawn into the 1 x 2 grid
par(old_par)

# barplot for class breakdown
plot(y)

# scatterplot matrix
featurePlot(x = x, y = y, plot = "ellipse")

# box and whisker plots for each attribute
featurePlot(x = x, y = y, plot = "box")

# density plots for each attribute by class value
# (each panel gets its own x and y axis range)
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Resampling set-up shared by every model below:
# 10-fold cross-validation, scored on classification accuracy.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# The seed is reset to the same value before every fit, so each train() call
# starts from the same RNG state.

# a) linear algorithms
# LDA
set.seed(7)
fit.lda <- train(
  Producer ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = control
)

# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <- train(
  Producer ~ .,
  data = dataset,
  method = "rpart",
  metric = metric,
  trControl = control
)
# kNN
set.seed(7)
fit.knn <- train(
  Producer ~ .,
  data = dataset,
  method = "knn",
  metric = metric,
  trControl = control
)

# c) advanced algorithms
# SVM (radial kernel)
set.seed(7)
fit.svm <- train(
  Producer ~ .,
  data = dataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)
# Random Forest
set.seed(7)
fit.rf <- train(
  Producer ~ .,
  data = dataset,
  method = "rf",
  metric = metric,
  trControl = control
)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
# summarize accuracy of models
# gather the cross-validation results of all five fits into one object
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, cart, knn, svm, rf 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## lda  0.1333333 0.3392857 0.3571429 0.3603663 0.4175824 0.5000000    0
## cart 0.3333333 0.3884615 0.4285714 0.4222711 0.4285714 0.5333333    0
## knn  0.3076923 0.3750000 0.4285714 0.4283883 0.4916667 0.5333333    0
## svm  0.2142857 0.4000000 0.4285714 0.4013919 0.4285714 0.4666667    0
## rf   0.2857143 0.4071429 0.4285714 0.4283883 0.4666667 0.5714286    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## lda  -0.17469880 0.1179556 0.1279337 0.1272347 0.1928732 0.3146853    0
## cart  0.09090909 0.1800595 0.2222222 0.2226022 0.2393290 0.3750000    0
## knn   0.07142857 0.1596363 0.2354860 0.2321355 0.3110119 0.3750000    0
## svm  -0.04054054 0.1928197 0.2221847 0.1947038 0.2315541 0.2857143    0
## rf    0.04109589 0.2042180 0.2406692 0.2342918 0.2867327 0.4246575    0
# plot the resampled metrics of the models side by side
dotplot(results)

# summarize the model chosen as best (the random forest fit)
print(fit.rf)
## Random Forest 
## 
## 142 samples
##   2 predictor
##   4 classes: 'A-EAST', 'A-NORTH', 'A-SOUTH', 'A-WEST' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 127, 127, 127, 129, 128, 128, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.4283883  0.2342918
## 
## Tuning parameter 'mtry' was held constant at a value of 2
# estimate skill of the random forest on the held-out validation dataset:
# predict a Producer for every validation row, then cross-tabulate the
# predictions against the true labels
predictions <- predict(fit.rf, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Producer)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A-EAST A-NORTH A-SOUTH A-WEST
##    A-EAST       6       1       3      3
##    A-NORTH      1       3       3      3
##    A-SOUTH      0       5       2      0
##    A-WEST       1       1       0      3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4             
##                  95% CI : (0.2387, 0.5789)
##     No Information Rate : 0.2857          
##     P-Value [Acc > NIR] : 0.09766         
##                                           
##                   Kappa : 0.2011          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A-EAST Class: A-NORTH Class: A-SOUTH Class: A-WEST
## Sensitivity                 0.7500        0.30000        0.25000       0.33333
## Specificity                 0.7407        0.72000        0.81481       0.92308
## Pos Pred Value              0.4615        0.30000        0.28571       0.60000
## Neg Pred Value              0.9091        0.72000        0.78571       0.80000
## Prevalence                  0.2286        0.28571        0.22857       0.25714
## Detection Rate              0.1714        0.08571        0.05714       0.08571
## Detection Prevalence        0.3714        0.28571        0.20000       0.14286
## Balanced Accuracy           0.7454        0.51000        0.53241       0.62821

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Template example chunk: per-column summary of `cars` (columns speed and dist,
# as the output below shows). `cars` is not defined in this file -- presumably
# the example data set that ships with R.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.