project_sulaiman

Load library

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(rpart)     
library(e1071)    
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Download dan load dataset

# Remote locations of the training and test CSV files.
trainUrl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testUrl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Read both files directly from their URLs. The literal strings "NA",
# "#DIV/0!" and the empty string are all parsed as missing values.
training <- read.csv(
  url(trainUrl),
  na.strings = c("NA", "#DIV/0!", "")
)
testing <- read.csv(
  url(testUrl),
  na.strings = c("NA", "#DIV/0!", "")
)

Praproses data

# Fail early if the outcome column is absent; everything below depends on it.
stopifnot("classe" %in% names(training))

# Keep only columns that contain no missing values at all.
# drop = FALSE keeps the result a data frame even if one column survives.
training <- training[, colSums(is.na(training)) == 0, drop = FALSE]
testing <- testing[, colSums(is.na(testing)) == 0, drop = FALSE]

# Discard the first seven remaining columns.
# NOTE(review): presumably row-id / user / timestamp bookkeeping columns that
# are not sensor measurements -- confirm against the CSV header.
training <- training[, -(1:7), drop = FALSE]
testing <- testing[, -(1:7), drop = FALSE]

# Predictors are the non-outcome columns present in BOTH data sets. The NA
# filter above runs on each frame independently, so a column that is complete
# in training but incomplete in testing must be removed from training too;
# otherwise the model would be fitted on a column the test set cannot supply.
# The outcome is identified by name rather than by its position.
is_predictor <- names(training) != "classe" &
  names(training) %in% names(testing)
predictors <- names(training)[is_predictor]

training <- training[, c(predictors, "classe"), drop = FALSE]
testing <- testing[, predictors, drop = FALSE]

Split data train dan validasi

# Fix the RNG seed so the partition below is reproducible.
set.seed(123)

# Put 70% of the rows into the training set; the remaining rows are held out
# for validation.
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)
trainingSet <- training[inTrain, ]
validationSet <- training[-inTrain, ]

# Make the outcome a factor, and force the validation outcome to use exactly
# the same level set as the training outcome.
trainingSet[["classe"]] <- as.factor(trainingSet[["classe"]])
validationSet[["classe"]] <- factor(
  validationSet[["classe"]],
  levels = levels(trainingSet[["classe"]])
)

Latih model random forest

# Fix the RNG seed before fitting so the model can be reproduced.
set.seed(12345)
# Fit a random forest that predicts `classe` from every other column of
# trainingSet, using the package's default settings.
model_rf <- randomForest(classe ~ ., data = trainingSet)

Prediksi data validasi

# Predict the class of every held-out validation row.
pred_rf <- predict(model_rf, newdata = validationSet)

# Cross-tabulate predictions against the true labels and report the summary
# statistics; left as a bare expression so the result prints.
confusionMatrix(data = pred_rf, reference = validationSet[["classe"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1674    3    0    0    0
##          B    0 1131    3    0    0
##          C    0    5 1023    8    4
##          D    0    0    0  956    4
##          E    0    0    0    0 1074
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9954         
##                  95% CI : (0.9933, 0.997)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9942         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9930   0.9971   0.9917   0.9926
## Specificity            0.9993   0.9994   0.9965   0.9992   1.0000
## Pos Pred Value         0.9982   0.9974   0.9837   0.9958   1.0000
## Neg Pred Value         1.0000   0.9983   0.9994   0.9984   0.9983
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2845   0.1922   0.1738   0.1624   0.1825
## Detection Prevalence   0.2850   0.1927   0.1767   0.1631   0.1825
## Balanced Accuracy      0.9996   0.9962   0.9968   0.9954   0.9963

Prediksi 20 data test

# Restrict the test set to the model's predictor columns, in training order.
# The outcome is excluded by name instead of by "last column" position, so
# the selection stays correct if the column order ever changes.
# base::setdiff is called explicitly because dplyr masks setdiff.
# drop = FALSE keeps a data frame even when only one predictor remains.
testing <- testing[, base::setdiff(names(trainingSet), "classe"), drop = FALSE]

# Predict the class of each test row and print the result.
final_predictions <- predict(model_rf, newdata = testing)
print(final_predictions)

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E