Using devices such as the Jawbone Up, Nike FuelBand, and Fitbit, it is now possible to collect a large amount of data about personal activity relatively inexpensively. These types of devices are part of the quantified self movement – a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. People regularly quantify how much of a particular activity they do, but they rarely quantify how well they do it.
In this project, your goal will be to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants, who were asked to perform barbell lifts correctly and incorrectly in 5 different ways.
More information is available from the website here: http://groupware.les.inf.puc-rio.br/har
# Load required packages
library(tree)          # decision trees
library(randomForest)  # random forests
library(e1071)         # naive Bayes (naiveBayes)
# Load training and test datasets
jb_train = read.delim(file = 'pml-training.csv', header = TRUE,
                      sep = ',', dec = '.')
jb_test = read.delim(file = 'pml-testing.csv', header = TRUE,
                     sep = ',', dec = '.')
# Split the original training data into a local training set and a validation set
data = jb_train
indexes = sample(1:nrow(data), size = 0.2*nrow(data)) # sample row indexes for the validation set
# Split data
test = data[indexes,]   # local validation set: 3924 rows & 160 columns
train = data[-indexes,] # local training set: 15698 rows & 160 columns
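As a quick sanity check on the split (a minimal sketch; the exact row counts depend on the random sample drawn above), the dimensions and class balance of the two pieces can be inspected:
dim(train)                      # expected: 15698 rows, 160 columns
dim(test)                       # expected: 3924 rows, 160 columns
prop.table(table(train$classe)) # class distribution in the local training set
prop.table(table(test$classe))  # class distribution in the validation set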
# create the tree model
tree_opt <- tree(classe ~ raw_timestamp_part_1 + raw_timestamp_part_2+
num_window + roll_belt + pitch_belt + yaw_belt+
total_accel_belt + gyros_belt_x + gyros_belt_y +
gyros_belt_z + accel_arm_z + accel_belt_x+ accel_belt_y+
accel_belt_z + magnet_belt_x + magnet_belt_y+
magnet_belt_z + roll_arm + pitch_arm + yaw_arm+
total_accel_arm + gyros_arm_x + gyros_arm_y + gyros_arm_z+
accel_arm_x + accel_arm_y + accel_arm_z + magnet_arm_x+
magnet_arm_y + magnet_arm_z + roll_dumbbell + pitch_dumbbell+
yaw_dumbbell, data = train)
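Before validating, the fitted tree itself can be inspected; this is a minimal sketch using the standard tree-package helpers (summary() reports the training misclassification rate, plot()/text() draw the tree):
# Inspect the fitted tree: training misclassification rate and structure
summary(tree_opt)
plot(tree_opt)
text(tree_opt, pretty = 0)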
# Predict classes on the local validation set
Prediction <- predict(tree_opt, test, type = "class")
submit_tree <- data.frame(userId = test$X, cl_pred = Prediction)
chk_tree_local <- data.frame(Id = submit_tree$userId, org_predn = test$classe,
                             new_pred = submit_tree$cl_pred)
Step 1.2 - Validation with local set
table(chk_tree_local$org_predn, chk_tree_local$new_pred)
##
## A B C D E
## A 863 80 88 30 52
## B 242 354 106 7 62
## C 158 163 299 11 67
## D 238 53 50 182 122
## E 101 45 15 10 526
From the validation matrix, we see this model has an accuracy of about 57.11%.
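The accuracy can be computed directly from the confusion matrix (a minimal sketch; correct predictions sit on the diagonal of the table):
tab_tree <- table(chk_tree_local$org_predn, chk_tree_local$new_pred)
sum(diag(tab_tree)) / sum(tab_tree) # proportion of validation rows classified correctly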
Step 1.3 - Making Predictions
# Predict classes for the quiz test set
Prediction <- predict(tree_opt, jb_test, type = "class")
submit <- data.frame(userId = jb_test$X, Class = Prediction)
submit$Class
## [1] B B B A A E E E A A B E A A E A A B A A
## Levels: A B C D E
# Output accuracy from quiz submission = 40%
set.seed(37) # for reproducibility
# Fit a random forest on the same set of predictors
b1 <- randomForest(classe ~ raw_timestamp_part_1 + raw_timestamp_part_2+
num_window + roll_belt + pitch_belt + yaw_belt+
total_accel_belt + gyros_belt_x + gyros_belt_y +
gyros_belt_z + accel_arm_z + accel_belt_x+ accel_belt_y+
accel_belt_z + magnet_belt_x + magnet_belt_y+
magnet_belt_z + roll_arm + pitch_arm + yaw_arm+
total_accel_arm + gyros_arm_x + gyros_arm_y + gyros_arm_z+
accel_arm_x + accel_arm_y + accel_arm_z + magnet_arm_x+
magnet_arm_y + magnet_arm_z + roll_dumbbell + pitch_dumbbell+
yaw_dumbbell , data = train,
importance =TRUE)
# Summarize the components of the fitted random forest object
summary(b1)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 15698 factor numeric
## err.rate 3000 -none- numeric
## confusion 30 -none- numeric
## votes 78490 matrix numeric
## oob.times 15698 -none- numeric
## classes 5 -none- character
## importance 224 -none- numeric
## importanceSD 192 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 15698 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
plot(b1) # OOB error rate as a function of the number of trees
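Because the forest was grown with importance = TRUE, the out-of-bag (OOB) error estimate and variable importance can also be examined; a minimal sketch using standard randomForest helpers:
print(b1)      # OOB error estimate and per-class confusion matrix
varImpPlot(b1) # mean decrease in accuracy / Gini for each predictor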
Step 2.2 - Validation with local set
# Predict on the local validation set with the random forest
Predrf_local <- predict(b1, test)
chkrf_local <- data.frame(userID = test$X, Classe_new = Predrf_local)
chk_rf_validn <- data.frame(Id = test$X, org_predn = test$classe,
                            new_pred = chkrf_local$Classe_new)
table(chk_rf_validn$org_predn, chk_rf_validn$new_pred)
##
## A B C D E
## A 1113 0 0 0 0
## B 1 770 0 0 0
## C 0 2 696 0 0
## D 0 0 0 645 0
## E 0 0 0 0 697
From the validation matrix, we see this model has an accuracy of about 99.9%, with only a handful of incorrect predictions!
Step 2.3 - Making Predictions
# Predict classes for the quiz test set with the random forest
Predrf <- predict(b1, jb_test)
chkrf <- data.frame(userID = jb_test$X, Classe = Predrf)
submit <- data.frame(userId = jb_test$X, Class = Predrf)
# Output accuracy from quiz submission = 100%
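If each of the 20 quiz predictions needs to be written to its own text file (the format used by the course's original submission page), a small helper along these lines could be used; the pml_write_files name and the one-file-per-prediction format are assumptions for illustration, not part of the original analysis:
# Hypothetical helper: write one text file per quiz prediction
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(x[i], file = filename, quote = FALSE,
                row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(as.character(submit$Class))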
# Fit a naive Bayes classifier on the same set of predictors
fit2 <- naiveBayes(classe ~ raw_timestamp_part_1 + raw_timestamp_part_2+
num_window + roll_belt + pitch_belt + yaw_belt+
total_accel_belt + gyros_belt_x + gyros_belt_y +
gyros_belt_z + accel_arm_z + accel_belt_x+ accel_belt_y+
accel_belt_z + magnet_belt_x + magnet_belt_y+
magnet_belt_z + roll_arm + pitch_arm + yaw_arm+
total_accel_arm + gyros_arm_x + gyros_arm_y + gyros_arm_z+
accel_arm_x + accel_arm_y + accel_arm_z + magnet_arm_x+
magnet_arm_y + magnet_arm_z + roll_dumbbell + pitch_dumbbell+
yaw_dumbbell ,
data = train)
summary(fit2)
## Length Class Mode
## apriori 5 table numeric
## tables 32 -none- list
## levels 5 -none- character
## call 4 -none- call
Step 3.2 - Validation with local set
# Predict on the local validation set with naive Bayes
local_test <- predict(fit2, test)
submit_nb <- data.frame(userId = test$X, Class = local_test)
chk_nb <- data.frame(id = test$X, org_pred = test$classe,
new_pred = submit_nb$Class)
table(chk_nb$org_pred, chk_nb$new_pred)
##
## A B C D E
## A 688 129 115 108 73
## B 225 317 105 33 91
## C 237 94 229 94 44
## D 116 133 95 214 87
## E 94 114 39 109 341
From the validation matrix, we see this model has an accuracy of only about 47.45%, with more incorrect than correct predictions!
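To see why naive Bayes struggles here, its posterior class probabilities can be inspected instead of the hard class labels; a minimal sketch using the e1071 predict method with type = "raw":
# Posterior class probabilities for the first few validation rows
nb_probs <- predict(fit2, test, type = "raw")
head(round(nb_probs, 3))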
Step 3.3 - Making Predictions
# Predict classes for the quiz test set with naive Bayes
prednb_test <- predict(fit2, jb_test)
submit <- data.frame(userId = jb_test$X, Class = prednb_test)
# Output accuracy from quiz submission = 40%
| No. | Model Name | Local validation accuracy | Quiz accuracy | Remarks |
|-----|------------|---------------------------|---------------|---------|
| 1. | Decision Trees | 57.11% | 40% | Performs only modestly better than a random guess. |
| 2. | Random Forest | 99.89% | 100% | Perfect prediction on the quiz set. |
| 3. | Naive Bayes | 47.45% | 40% | Worst performance despite its complexity. |