# Packages required for this analysis.
library(ggplot2)
library(RColorBrewer)
library(rattle)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)

# Set the working directory that contains the CSV files.
# NOTE(review): this path is machine-specific; change it (or start R in the
# data directory) before running the script on another machine.
setwd("C:/Users/79881/Documents")

# Load the training and testing CSV files.
Train1 <- read.csv("pml-training.csv")
Test1 <- read.csv("pml-testing.csv")
# Use near-zero-variance detection to flag variables that add (almost) no
# variation to the data. The flags are computed on the training data only and
# the same column mask is then applied to both data sets.
# NOTE(review): this assumes Train1 and Test1 have their columns in the same
# order -- confirm against the CSV files.
NZV <- nearZeroVar(Train1, saveMetrics = TRUE)
Training <- Train1[, !NZV$nzv]
Testing <- Test1[, !NZV$nzv]
# Drop the first five columns (per the original note: subject and timestamp
# fields), which are identifiers rather than predictors.
Training <- Training[, -c(1:5)]
Testing <- Testing[, -c(1:5)]

# Drop every column that contains at least one NA in the training data.
# isNAcolsums is TRUE for columns WITHOUT any NA, i.e. the columns to keep.
isNAcolsums <- colSums(is.na(Training)) == 0
Training <- Training[, isNAcolsums]
Testing <- Testing[, isNAcolsums]
# Split the cleaned training data into two sets:
#   - Training   (70%): used to fit the models
#   - Validation (30%): held out to estimate out-of-sample performance
set.seed(56789)

# Make the outcome a factor before splitting so both subsets share the same
# class levels.
Training$classe <- as.factor(Training$classe)

inTrain <- createDataPartition(Training$classe, p = 0.70, list = FALSE)

# Take the validation rows BEFORE overwriting Training. Subsetting Training
# first and then indexing it with -inTrain would draw the validation rows
# from the training subset itself (data leakage), inflating accuracy.
Validation <- Training[-inTrain, ]
Training <- Training[inTrain, ]
# Fit a classification decision tree on the training subset and plot it.
PredictTree <- rpart(classe ~ ., data = Training, method = "class")
prp(
  PredictTree,
  main = "Прогноз дерева решений",
  box.col = c("red", "green", "blue", "yellow", "brown")
)
# Evaluate the decision tree on the validation set.
Validation$classe <- as.factor(Validation$classe)
ValPredictTree <- predict(PredictTree, Validation, type = "class")

# confusionMatrix() expects the predictions as `data` and the true classes as
# `reference`. The recorded output below was produced with the two arguments
# swapped, so its table is transposed and its per-class statistics describe
# the predictions rather than the true classes; overall accuracy and kappa
# are unaffected by the order.
confusionMatrix(data = ValPredictTree, reference = Validation$classe)
#
#Confusion Matrix and Statistics
#
#           Reference
#Prediction   A   B   C   D   E
#         A 722  73   2  24   5
#         B  74 375  40  40  19
#         C  15  24 432  28   3
#         D  35  78  68 277   1
#         E  43  88  26  52 330
#
# Overall Statistics
#
# Accuracy : 0.7432
# 95% CI : (0.7268, 0.7591)
# No Information Rate : 0.3093
# P-Value [Acc > NIR] : < 2.2e-16
#
# Kappa : 0.6739
#
#Mcnemar's Test P-Value : < 2.2e-16
#Statistics by Class:
#
# Class: A Class: B Class: C Class: D Class: E
#Sensitivity 0.8121 0.5878 0.7606 0.65796 0.9218
#Specificity 0.9476 0.9226 0.9696 0.92581 0.9169
#Pos Pred Value 0.8741 0.6843 0.8606 0.60349 0.6122
#Neg Pred Value 0.9185 0.8869 0.9427 0.94037 0.9880
#Prevalence 0.3093 0.2220 0.1976 0.14649 0.1246
#Detection Rate 0.2512 0.1305 0.1503 0.09638 0.1148
#Detection Prevalence 0.2874 0.1907 0.1747 0.15971 0.1875
#Balanced Accuracy 0.8799 0.7552 0.8651 0.79188 0.9194
# Accuracy/Kappa of the decision tree on the validation set
# (postResample takes predictions first, observed values second).
accuracy.Tree <- postResample(ValPredictTree, Validation$classe)

# Error rate of the tree on the validation set: 1 - overall accuracy.
OSE.Tree <- 1 - as.numeric(
  confusionMatrix(
    data = ValPredictTree,
    reference = Validation$classe
  )$overall[["Accuracy"]]
)
# Fit a random forest with 5-fold cross-validation and 150 trees per forest.
# allowParallel is an option of trainControl(), not of train(); given to
# train() directly it is only forwarded to the underlying model function, so
# it is set in trainControl() here.
RF <- train(
  classe ~ .,
  data = Training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
  ntree = 150
)
# Random Forest
#
# 13737 samples
# 53 predictor
# 5 classes: 'A', 'B', 'C', 'D', 'E'
#
# No pre-processing
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 10988, 10990, 10991, 10990, 10989
# Resampling results across tuning parameters:
#
# mtry Accuracy Kappa
# 2 0.9933033 0.9915283
# 27 0.9971614 0.9964095
# 53 0.9938853 0.9922657
#
# Accuracy was used to select the optimal model using the largest value.
# The final value used for the model was mtry = 27.
# Evaluate the random forest on the validation set.
# NOTE(review): the recorded output below shows 4111 validation rows and an
# accuracy of 1, which is consistent with the validation rows having been
# drawn from the training subset; treat it as optimistic and re-run after
# checking the train/validation split.
PredictRF <- predict(RF, Validation)

# Predictions go in `data`, true classes in `reference`.
confusionMatrix(data = PredictRF, reference = Validation$classe)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction    A    B    C    D    E
##          A 1192    0    0    0    0
##          B    0  782    0    0    0
##          C    0    0  725    0    0
##          D    0    0    0  672    0
##          E    0    0    0    0  740
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.29
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.00 1.0000 1.0000 1.0000 1.00
## Specificity 1.00 1.0000 1.0000 1.0000 1.00
## Pos Pred Value 1.00 1.0000 1.0000 1.0000 1.00
## Neg Pred Value 1.00 1.0000 1.0000 1.0000 1.00
## Prevalence 0.29 0.1902 0.1764 0.1635 0.18
## Detection Rate 0.29 0.1902 0.1764 0.1635 0.18
## Detection Prevalence 0.29 0.1902 0.1764 0.1635 0.18
## Balanced Accuracy 1.00 1.0000 1.0000 1.0000 1.00
# Accuracy/Kappa of the random forest on the validation set
# (postResample takes predictions first, observed values second).
accuracyRF <- postResample(PredictRF, Validation$classe)

# Error rate of the random forest on the validation set: 1 - overall accuracy.
OSE.RF <- 1 - as.numeric(
  confusionMatrix(
    data = PredictRF,
    reference = Validation$classe
  )$overall[["Accuracy"]]
)