# Show the current working directory; the CSV files read further down are
# given as bare file names, so they are resolved relative to this directory.
getwd()
## [1] "C:/Users/jfole/OneDrive/Desktop/Ralph_ML"
library(tidyverse)
library(caret)
library(kernlab)
library(kernlab)
library(kableExtra)
library(lubridate)
library(RCurl)
library(data.table)
#install.packages("caret", dependencies = c("Depends", "Suggests"))
# Load the first data set from Johns_Weather.csv.
dataset <- read_csv("Johns_Weather.csv")
### Let's take a gander at the data
View(dataset)
# Seed the RNG before partitioning: createDataPartition() samples rows at
# random, so without a seed the train/validation split (and therefore every
# accuracy figure computed from it) changes on each run.
set.seed(7)
validation_index <- createDataPartition(
  dataset$Hundreds,
  p = 0.8,
  list = FALSE
)
## Warning in createDataPartition(dataset$Hundreds, p = 0.8, list = FALSE):
## Some classes have a single record ( Twenty, UnderOne ) and these will be
## selected for the sample
# Rows that were not sampled become the hold-out set; the sampled 80% replace
# `dataset` and are what the models are trained on.
validation <- dataset[-validation_index, ]
dataset <- dataset[validation_index, ]
#validation_index
# Resampling scheme (10-fold cross-validation) and the metric used to pick the
# best tuning parameters for every model in this section.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# The RNG is reset to the same seed before each fit so that every model starts
# its resampling from the same random state.
# lda and rf are trained here because resamples() below compares all five
# models; without these two fits it fails on the undefined objects.
set.seed(7)
fit.lda <- train(
  Hundreds ~ .,
  data = dataset, method = "lda", metric = metric, trControl = control
)
set.seed(7)
fit.cart <- train(
  Hundreds ~ .,
  data = dataset, method = "rpart", metric = metric, trControl = control
)
set.seed(7)
fit.knn <- train(
  Hundreds ~ .,
  data = dataset, method = "knn", metric = metric, trControl = control
)
## Warning: predictions failed for Fold10: k=5 Error in dimnames(x) <- dn :
## length of 'dimnames' [2] not equal to array extent
## Warning: predictions failed for Fold10: k=7 Error in dimnames(x) <- dn :
## length of 'dimnames' [2] not equal to array extent
## Warning: predictions failed for Fold10: k=9 Error in dimnames(x) <- dn :
## length of 'dimnames' [2] not equal to array extent
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
set.seed(7)
fit.svm <- train(
  Hundreds ~ .,
  data = dataset, method = "svmRadial", metric = metric, trControl = control
)
set.seed(7)
fit.rf <- train(
  Hundreds ~ .,
  data = dataset, method = "rf", metric = metric, trControl = control
)

# Collect the cross-validation results of all five fits and summarise them.
results <- resamples(
  list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf)
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.06666667 0.1294643 0.1979950 0.1868734 0.2598684 0.2666667 0
## cart 0.09523810 0.1882440 0.1952381 0.1945269 0.2078947 0.3157895 0
## knn 0.04761905 0.1052632 0.1250000 0.1519598 0.1875000 0.3684211 1
## svm 0.09523810 0.1394737 0.2005013 0.2094330 0.2968750 0.3333333 0
## rf 0.05263158 0.1104167 0.1456140 0.1548951 0.2053571 0.2631579 2
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda -0.01449275 0.03743316 0.10244594 0.09592883 0.1541003 0.1790123 0
## cart 0.02416918 0.07413698 0.10438894 0.10806757 0.1434689 0.2257053 0
## knn -0.01694915 0.02608696 0.03003003 0.07177962 0.1034483 0.2962963 1
## svm 0.01481481 0.06034943 0.10824609 0.12294821 0.1995154 0.2424242 0
## rf -0.01483680 0.02964744 0.07024257 0.07921244 0.1245021 0.1963746 2
# Plot the resampling results collected in `results`, one entry per model.
dotplot(results)
# Print the SVM fit: its resampling setup and the accuracy/kappa obtained for
# each tuning-parameter value tried.
print(fit.svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 182 samples
## 2 predictor
## 21 classes: 'Eight', 'Eighteen', 'Eleven', 'Fifteen', 'Five', 'Four', 'Fourteen', 'Nine', 'Nineteen', 'One', 'Seven', 'Seventeen', 'Six', 'Sixteen', 'Ten', 'Thirteen', 'Three', 'Twelve', 'Twenty', 'Two', 'UnderOne'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 163, 167, 161, 163, 161, 163, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.1743922 0.07363863
## 0.50 0.2094330 0.12294821
## 1.00 0.2007080 0.12222243
##
## Tuning parameter 'sigma' was held constant at a value of 7.109519
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 7.109519 and C = 0.5.
### We will also add in the day of the week (1:7), since Friday night is likely to be busier than Monday for going out with friends.
#Ralph_Expanded<- read_csv("Ralph_ML_Date.csv")
# Read the expanded data set (day number, date, month, ... plus the Group
# label) from John_ML_Date.csv.
John_Expanded <- read_csv(file = "John_ML_Date.csv")
## Warning: Missing column names filled in: 'X9' [9], 'X10' [10], 'X11' [11],
## 'X12' [12], 'X13' [13]
# Preview the first rows to check what was read.
John_Expanded %>% head()
## # A tibble: 6 x 13
## DayNumber Date Month Weather Tempature Sales Hundreds Group X9 X10
## <int> <chr> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 6 3/18~ 3 2 58 572. Five Good <NA> <NA>
## 2 7 3/19~ 3 2 45 616. Six Good <NA> <NA>
## 3 1 3/20~ 3 3 50 535 Five Good <NA> <NA>
## 4 2 3/21~ 3 3 50 323 Three Poor <NA> <NA>
## 5 3 3/22~ 3 3 50 376. Three Poor <NA> <NA>
## 6 4 3/23~ 3 3 70 681. Six Good <NA> <NA>
## # ... with 3 more variables: X11 <chr>, X12 <chr>, X13 <chr>
# Parse month/day/year date strings into Date values.
# The original first line had its `$` operators mangled into `\(` / `\)` and
# did not parse. The conversion is also guarded, because the first data set
# may not carry a Date column at all.
if ("Date" %in% names(dataset)) {
  dataset$Date <- mdy(dataset$Date)
}
John_Expanded$Date <- mdy(John_Expanded$Date)

# Keep only the predictors and the Group label used for modelling.
# dplyr:: is spelled out because other packages (e.g. MASS, which caret may
# load for lda) also export a select() that can mask dplyr's.
John_Select <- dplyr::select(
  John_Expanded,
  DayNumber, Month, Weather, Tempature, Group
)
head(John_Select)
## # A tibble: 6 x 5
## DayNumber Month Weather Tempature Group
## <int> <int> <int> <int> <chr>
## 1 6 3 2 58 Good
## 2 7 3 2 45 Good
## 3 1 3 3 50 Good
## 4 2 3 3 50 Poor
## 5 3 3 3 50 Poor
## 6 4 3 3 70 Good
# Seed the RNG before partitioning: createDataPartition() samples rows at
# random, so without a seed the train/validation split differs on every run.
set.seed(7)
validation_index_2 <- createDataPartition(
  John_Select$Group,
  p = 0.8,
  list = FALSE
)
# Rows that were not sampled form the hold-out set; the sampled 80% are the
# training data for the Group models.
validation_2 <- John_Select[-validation_index_2, ]
dataset_2 <- John_Select[validation_index_2, ]
# Resampling scheme (10-fold cross-validation) and selection metric shared by
# the five classifiers trained on dataset_2.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Each fit predicts Group from all remaining columns; the RNG is reset to the
# same seed before every train() call.
set.seed(7)
fit.lda <- train(
  Group ~ .,
  data = dataset_2, method = "lda", metric = metric, trControl = control
)

set.seed(7)
fit.cart <- train(
  Group ~ .,
  data = dataset_2, method = "rpart", metric = metric, trControl = control
)

set.seed(7)
fit.knn <- train(
  Group ~ .,
  data = dataset_2, method = "knn", metric = metric, trControl = control
)

set.seed(7)
fit.svm <- train(
  Group ~ .,
  data = dataset_2, method = "svmRadial", metric = metric, trControl = control
)

set.seed(7)
fit.rf <- train(
  Group ~ .,
  data = dataset_2, method = "rf", metric = metric, trControl = control
)

# Gather the cross-validation results of all five fits and summarise them.
results <- resamples(
  list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf)
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.2777778 0.4705882 0.5147059 0.5418301 0.6617647 0.7058824 0
## cart 0.4117647 0.5294118 0.5555556 0.5852941 0.6617647 0.7222222 0
## knn 0.4705882 0.5637255 0.6666667 0.6258170 0.7058824 0.7222222 0
## svm 0.3333333 0.5882353 0.6290850 0.6094771 0.6617647 0.7777778 0
## rf 0.4705882 0.5784314 0.6470588 0.6385621 0.6960784 0.8235294 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda -0.098591549 0.1957909 0.2568557 0.3029127 0.4789323 0.5595855 0
## cart 0.123711340 0.2803635 0.3009709 0.3702958 0.5050794 0.5964126 0
## knn 0.168478261 0.3450521 0.4999571 0.4352792 0.5543883 0.5964126 0
## svm 0.009174312 0.3711988 0.4253877 0.4092688 0.4973245 0.6651163 0
## rf 0.186170213 0.3733415 0.4687500 0.4537053 0.5441077 0.7343750 0
# Plot the resampling results collected in `results`, one entry per model.
dotplot(results)
# Print the k-nearest-neighbours fit: its resampling setup and the
# accuracy/kappa obtained for each value of k tried.
print(fit.knn)
## k-Nearest Neighbors
##
## 174 samples
## 4 predictor
## 3 classes: 'Good', 'Great', 'Poor'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 157, 157, 156, 157, 157, 156, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.5973856 0.3901240
## 7 0.5849673 0.3718785
## 9 0.6258170 0.4352792
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Load the comparison table from ResultsCompared.csv and inspect it.
Table <- read_csv("ResultsCompared.csv")
## Warning: Missing column names filled in: 'X1' [1]
View(Table)
getwd()
## [1] "C:/Users/jfole/OneDrive/Desktop/Ralph_ML"
# Render the table with kableExtra styling. The bootstrap option names are
# lowercase; the previous "Striped" is not a recognised option, so the row
# striping was not applied.
kable(Table) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
| X1 | Hundreds | Groups |
|---|---|---|
| Best Classifier | Cart | knn |
| Mean Accuracy | 0.2 | 0.66 |