1. Load the data
library(caret)
library(RWeka)
library(Amelia)
library(ggplot2)
set.seed(12345)
url1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
#download.file(url1,method="curl",destfile="/Users/alex/Documents/R directory/Machine learning/training.csv")
#download.file(url2,method="curl",destfile="/Users/alex/Documents/R directory/Machine learning/testing.csv")
fileDownload <- date()
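If the files are already on disk, the download step can be skipped; a minimal sketch, assuming the same local paths as above:
# Download each file only if no local copy exists yet
dataDir <- "/Users/alex/Documents/R directory/Machine learning"
if (!file.exists(file.path(dataDir, "training.csv")))
    download.file(url1, destfile = file.path(dataDir, "training.csv"), method = "curl")
if (!file.exists(file.path(dataDir, "testing.csv")))
    download.file(url2, destfile = file.path(dataDir, "testing.csv"), method = "curl")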
2. Read the files into separate tables.
training <- read.csv("/Users/alex/Documents/R directory/Machine learning/training.csv")
test <- read.csv("/Users/alex/Documents/R directory/Machine learning/testing.csv")
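Depending on how missing data are encoded in the raw CSVs, blank fields and spreadsheet error strings can be mapped to NA at read time; a sketch, assuming such strings occur in the files (note this would change the NA counts below):
#training <- read.csv("/Users/alex/Documents/R directory/Machine learning/training.csv",
#                     na.strings = c("NA", "", "#DIV/0!"))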
3. Clean the dataset
## Assign factor levels to the predicted variable
training$classe <- as.factor(training$classe)
# Visualize missing values
missmap(training, main="Missingness map of the Human Activity Recognition training dataset", col=c("salmon", "purple"), legend=FALSE)

## Remove columns that consist almost entirely of missing values
table(colSums(is.na(training)))
##
##     0 19216
##    93    67
# 93 columns contain no NAs; the other 67 are NA in 19216 of the 19622 rows
miss <- colnames(training)[colSums(is.na(training)) > 0]
training <- training[,!colnames(training) %in% miss]
training <- na.omit(training) # safety net; no rows with NAs remain at this point
## Remove near-zero-variance predictors
nzv <- nearZeroVar(training)
training <- training[,-nzv]
# Drop the first six bookkeeping columns (row index, user name, timestamps, window number)
training <- training[,-c(1:6)]
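# For reference, the diagnostics behind this filter can be inspected directly
# (run on the data before the nzv columns are dropped):
#nzvMetrics <- nearZeroVar(training, saveMetrics = TRUE)
#head(nzvMetrics[nzvMetrics$nzv, ])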
str(training)
## 'data.frame': 19622 obs. of 53 variables:
## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
## $ gyros_belt_x : num 0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
## $ gyros_belt_y : num 0 0 0 0 0.02 0 0 0 0 0 ...
## $ gyros_belt_z : num -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
## $ accel_belt_x : int -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
## $ accel_belt_y : int 4 4 5 3 2 4 3 4 2 4 ...
## $ accel_belt_z : int 22 22 23 21 24 21 21 21 24 22 ...
## $ magnet_belt_x : int -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
## $ magnet_belt_y : int 599 608 600 604 600 603 599 603 602 609 ...
## $ magnet_belt_z : int -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
## $ roll_arm : num -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
## $ pitch_arm : num 22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
## $ yaw_arm : num -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
## $ total_accel_arm : int 34 34 34 34 34 34 34 34 34 34 ...
## $ gyros_arm_x : num 0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
## $ gyros_arm_y : num 0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
## $ gyros_arm_z : num -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
## $ accel_arm_x : int -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
## $ accel_arm_y : int 109 110 110 111 111 111 111 111 109 110 ...
## $ accel_arm_z : int -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
## $ magnet_arm_x : int -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
## $ magnet_arm_y : int 337 337 344 344 337 342 336 338 341 334 ...
## $ magnet_arm_z : int 516 513 513 512 506 513 509 510 518 516 ...
## $ roll_dumbbell : num 13.1 13.1 12.9 13.4 13.4 ...
## $ pitch_dumbbell : num -70.5 -70.6 -70.3 -70.4 -70.4 ...
## $ yaw_dumbbell : num -84.9 -84.7 -85.1 -84.9 -84.9 ...
## $ total_accel_dumbbell: int 37 37 37 37 37 37 37 37 37 37 ...
## $ gyros_dumbbell_x : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gyros_dumbbell_y : num -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
## $ gyros_dumbbell_z : num 0 0 0 -0.02 0 0 0 0 0 0 ...
## $ accel_dumbbell_x : int -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
## $ accel_dumbbell_y : int 47 47 46 48 48 48 47 46 47 48 ...
## $ accel_dumbbell_z : int -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
## $ magnet_dumbbell_x : int -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
## $ magnet_dumbbell_y : int 293 296 298 303 292 294 295 300 292 291 ...
## $ magnet_dumbbell_z : num -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
## $ roll_forearm : num 28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
## $ pitch_forearm : num -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
## $ yaw_forearm : num -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
## $ total_accel_forearm : int 36 36 36 36 36 36 36 36 36 36 ...
## $ gyros_forearm_x : num 0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
## $ gyros_forearm_y : num 0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
## $ gyros_forearm_z : num -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
## $ accel_forearm_x : int 192 192 196 189 189 193 195 193 193 190 ...
## $ accel_forearm_y : int 203 203 204 206 206 203 205 205 204 205 ...
## $ accel_forearm_z : int -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
## $ magnet_forearm_x : int -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
## $ magnet_forearm_y : num 654 661 658 658 655 660 659 660 653 656 ...
## $ magnet_forearm_z : num 476 473 469 469 473 478 470 474 476 473 ...
## $ classe : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
4. Model training
inTrain <- createDataPartition(y=training$classe,p=.7,list=F)
testing1 <- training[-inTrain,]
training1 <- training[inTrain,]
nrow(training1)
## [1] 13737
nrow(testing1)
## [1] 5885
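Because createDataPartition samples within each level of classe, the class balance should be preserved in both partitions; a quick check:
# Class proportions should be nearly identical across the two partitions
round(rbind(training1 = prop.table(table(training1$classe)),
            testing1  = prop.table(table(testing1$classe))), 3)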
# A trainControl object for 3-fold cross-validation was set up; note, however,
# that it was not passed to train() below, so the saved model was tuned with
# caret's default bootstrap resampling (25 reps), as the print output confirms:
#fitControl <- trainControl(method = "cv", number = 3)
#fit <- train(classe~.,method="gbm",data=training1,verbose=F,tuneLength = 5)
#saveRDS(fit, "gbm.RDS")
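# For reference, a sketch of the cross-validated variant; passing
# trControl = fitControl is what would actually enable the 3 folds:
#fitCV <- train(classe ~ ., method = "gbm", data = training1,
#               trControl = fitControl, verbose = FALSE, tuneLength = 5)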
modelFit <- readRDS("gbm.RDS")
varImp(modelFit$finalModel, scale=F)
## Overall
## roll_belt 3004.093477
## pitch_belt 679.993968
## yaw_belt 1648.889106
## total_accel_belt 20.856205
## gyros_belt_x 5.491205
## gyros_belt_y 86.565344
## gyros_belt_z 434.596881
## accel_belt_x 6.604805
## accel_belt_y 42.358991
## accel_belt_z 191.374454
## magnet_belt_x 197.878859
## magnet_belt_y 210.818436
## magnet_belt_z 623.077487
## roll_arm 117.009951
## pitch_arm 20.422464
## yaw_arm 333.345001
## total_accel_arm 33.315643
## gyros_arm_x 39.760450
## gyros_arm_y 122.725285
## gyros_arm_z 0.000000
## accel_arm_x 76.385023
## accel_arm_y 52.890282
## accel_arm_z 22.002730
## magnet_arm_x 72.423170
## magnet_arm_y 159.564827
## magnet_arm_z 185.908975
## roll_dumbbell 456.561032
## pitch_dumbbell 30.514983
## yaw_dumbbell 99.766805
## total_accel_dumbbell 112.713723
## gyros_dumbbell_x 80.708692
## gyros_dumbbell_y 278.494692
## gyros_dumbbell_z 20.389388
## accel_dumbbell_x 217.137012
## accel_dumbbell_y 294.569673
## accel_dumbbell_z 341.248980
## magnet_dumbbell_x 252.266311
## magnet_dumbbell_y 1013.621658
## magnet_dumbbell_z 1106.217746
## roll_forearm 807.672343
## pitch_forearm 1577.666871
## yaw_forearm 17.568953
## total_accel_forearm 41.468195
## gyros_forearm_x 9.576312
## gyros_forearm_y 50.887809
## gyros_forearm_z 29.283814
## accel_forearm_x 311.183222
## accel_forearm_y 74.968003
## accel_forearm_z 369.010177
## magnet_forearm_x 145.863822
## magnet_forearm_y 60.842073
## magnet_forearm_z 358.652175
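The ranking is easier to read graphically; a sketch using caret's plot method for variable importance (top sets how many predictors are shown):
# Plot the 20 most influential predictors of the boosted model
plot(varImp(modelFit), top = 20)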
print(modelFit)
## Stochastic Gradient Boosting
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737, ...
##
## Resampling results across tuning parameters:
##
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD  Kappa SD
##   1                   50      0.7516434  0.6849270  0.009168248  0.011676189
##   1                  100      0.8196163  0.7716189  0.005586482  0.007064147
##   1                  150      0.8506829  0.8109837  0.005845992  0.007375821
##   1                  200      0.8704462  0.8360045  0.005620265  0.007091741
##   1                  250      0.8839624  0.8530955  0.004864760  0.006135850
##   2                   50      0.8537421  0.8146310  0.005198881  0.006560892
##   2                  100      0.9034158  0.8777253  0.004886976  0.006174827
##   2                  150      0.9282319  0.9091575  0.004043820  0.005120780
##   2                  200      0.9443170  0.9295246  0.003364400  0.004246717
##   2                  250      0.9556296  0.9438435  0.002573835  0.003249857
##   3                   50      0.8931105  0.8646357  0.006337308  0.008019292
##   3                  100      0.9385834  0.9222610  0.004063111  0.005130304
##   3                  150      0.9585646  0.9475615  0.002034929  0.002559469
##   3                  200      0.9682432  0.9598137  0.002076093  0.002613941
##   3                  250      0.9744680  0.9676934  0.002215972  0.002791496
##   4                   50      0.9193958  0.8979421  0.004746302  0.005992431
##   4                  100      0.9566473  0.9451295  0.003316027  0.004182738
##   4                  150      0.9704781  0.9626419  0.002475504  0.003126037
##   4                  200      0.9775544  0.9715989  0.002262608  0.002854093
##   4                  250      0.9817048  0.9768510  0.002065833  0.002609490
##   5                   50      0.9364768  0.9195725  0.003606252  0.004569988
##   5                  100      0.9665538  0.9576745  0.002197911  0.002754739
##   5                  150      0.9767470  0.9705757  0.001904366  0.002403089
##   5                  200      0.9820477  0.9772850  0.001886927  0.002377884
##   5                  250      0.9846874  0.9806250  0.001495849  0.001890803
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 250,
## interaction.depth = 5, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot the model's accuracy across the tuning grid:
plot(modelFit, metric="Accuracy")
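Since ggplot2 is already loaded, the same tuning profile can also be drawn with caret's ggplot method for train objects:
# ggplot2 view of accuracy across boosting iterations and tree depth
ggplot(modelFit, metric = "Accuracy")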

5. Decision tree trial using Recursive Partitioning and Regression Trees
library(rpart) # Recursive Partitioning and Regression Trees
library(rpart.plot) # Decision Tree plot
treeModel <- rpart(classe ~ ., data=training1, method="class")
# Predicting:
treePrediction <- predict(treeModel, testing1, type = "class")
# Plot of the Decision Tree
rpart.plot(treeModel, main="Classification Tree", extra=102, under=TRUE, faclen=0)
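Before trusting the tree, rpart's complexity-parameter table can show whether pruning would help; a quick sketch using the package's built-in diagnostics:
# Cross-validated error (xerror) by complexity parameter; a smaller tree
# with comparable xerror would be a candidate for pruning
printcp(treeModel)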

confusionMatrix(treePrediction, testing1$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1498 196 69 106 25
## B 42 669 85 86 92
## C 43 136 739 129 131
## D 33 85 98 553 44
## E 58 53 35 90 790
##
## Overall Statistics
##
## Accuracy : 0.722
## 95% CI : (0.7104, 0.7334)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6467
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8949 0.5874 0.7203 0.57365 0.7301
## Specificity 0.9060 0.9357 0.9097 0.94717 0.9509
## Pos Pred Value 0.7909 0.6869 0.6273 0.68020 0.7700
## Neg Pred Value 0.9559 0.9043 0.9390 0.91897 0.9399
## Prevalence 0.2845 0.1935 0.1743 0.16381 0.1839
## Detection Rate 0.2545 0.1137 0.1256 0.09397 0.1342
## Detection Prevalence 0.3218 0.1655 0.2002 0.13815 0.1743
## Balanced Accuracy 0.9004 0.7615 0.8150 0.76041 0.8405
6. Outcome prediction using Stochastic Gradient Boosting
The single tree's ~72% held-out accuracy falls well short of the boosted model's resampling estimate, so the GBM fit is used for the final predictions:
predictions <- predict(modelFit,testing1)
confusionMatrix(predictions,testing1$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1670 10 0 0 0
## B 4 1119 17 0 1
## C 0 10 1005 18 1
## D 0 0 4 944 5
## E 0 0 0 2 1075
##
## Overall Statistics
##
## Accuracy : 0.9878
## 95% CI : (0.9846, 0.9904)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9845
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9976 0.9824 0.9795 0.9793 0.9935
## Specificity 0.9976 0.9954 0.9940 0.9982 0.9996
## Pos Pred Value 0.9940 0.9807 0.9720 0.9906 0.9981
## Neg Pred Value 0.9990 0.9958 0.9957 0.9959 0.9985
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2838 0.1901 0.1708 0.1604 0.1827
## Detection Prevalence 0.2855 0.1939 0.1757 0.1619 0.1830
## Balanced Accuracy 0.9976 0.9889 0.9868 0.9887 0.9966
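The expected out-of-sample error follows directly from the held-out accuracy above, roughly 1.2%:
# Estimated out-of-sample error = 1 - held-out accuracy
cm <- confusionMatrix(predictions, testing1$classe)
round(1 - as.numeric(cm$overall["Accuracy"]), 4)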
predictfinal <- predict(modelFit, test)
predictfinal
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
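Finally, a minimal sketch for saving the 20 predictions to individual files; the writePredictions helper below is hypothetical, not part of the original analysis:
# Hypothetical helper: write each prediction to its own problem_id_<i>.txt
writePredictions <- function(preds) {
  for (i in seq_along(preds)) {
    write.table(as.character(preds[i]), file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
#writePredictions(predictfinal)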