Load the data and use median imputation (caret's `medianImpute`) to replace missing values.
library(caret)
## Warning: package 'caret' was built under R version 3.1.1
# Load data
# Both CSV files live in the same folder, so the folder is named once and the
# two file paths are built from it with file.path().
data_dir <- "/Users/xiangjiang/Documents/git/Data Science at Coursera/8.Practical Machine Learning"
train <- read.csv(file.path(data_dir, "pml-training.csv"))
test <- read.csv(file.path(data_dir, "pml-testing.csv"))
# Drop the first column of each data set so it is never used as a predictor.
# NOTE(review): presumably this is the row-index column of the CSV export;
# confirm against the files.
# NOTE: the data frame `train` shares its name with caret's train() function;
# calls of the form train(...) still resolve to the function.
train <- train[, -1]
test <- test[, -1]
# Remove columns with too many NA values.
# A column is kept only when it has more than `min_non_na` non-missing values
# in the training data. The mask is computed on `train` and applied by
# position to `test`, so both data sets must have the same number of columns
# in the same order.
# NOTE: despite its name, `col2remove` is TRUE for the columns that are KEPT.
min_non_na <- 5000
stopifnot(ncol(train) == ncol(test))
col2remove <- colSums(!is.na(train)) > min_non_na
train <- train[, col2remove]
test <- test[, col2remove]
# Convert the categorical predictors to factors. The test factors reuse the
# level sets of the training factors so that both data sets encode a category
# identically; building them independently would give each its own levels.
factor_cols <- c("user_name", "cvtd_timestamp", "new_window")
for (col_name in factor_cols) {
  train[[col_name]] <- factor(train[[col_name]])
  train_levels <- levels(train[[col_name]])
  # Fail early with a clear message instead of silently turning categories
  # that never occurred in the training data into NA.
  unseen <- setdiff(
    unique(as.character(test[[col_name]])),
    c(train_levels, NA)
  )
  if (length(unseen) > 0) {
    stop(
      "test$", col_name, " has values absent from the training data: ",
      paste(unseen, collapse = ", "),
      call. = FALSE
    )
  }
  test[[col_name]] <- factor(test[[col_name]], levels = train_levels)
}
# The outcome exists only in the training data.
train$classe <- factor(train$classe)
# get numeric columns
# vapply() always returns a logical vector (one entry per column), whereas the
# return type of sapply() depends on its input.
numeric_cols <- vapply(train, is.numeric, logical(1))
# preprocessing for training data
# The centering, scaling and median-imputation parameters are estimated from
# the training data only and stored in `prep`.
# drop = FALSE keeps a data frame even if only one numeric column remains.
prep <- preProcess(
  train[, numeric_cols, drop = FALSE],
  method = c("center", "scale", "medianImpute")
)
newTrain <- predict(prep, train[, numeric_cols, drop = FALSE])
# Re-attach the factor predictors, which were left out of the numeric
# transformation above.
newTrain <- cbind(
  user_name = train$user_name,
  cvtd_timestamp = train$cvtd_timestamp,
  new_window = train$new_window,
  newTrain
)
# preprocessing for testing data
# The test data is transformed with the parameters learned from the training
# data (`prep`); nothing is re-estimated on the test set.
newTest <- predict(prep, test[, numeric_cols, drop = FALSE])
newTest <- cbind(
  user_name = test$user_name,
  cvtd_timestamp = test$cvtd_timestamp,
  new_window = test$new_window,
  newTest
)
The model is a decision tree (caret method `ctree`), and cross-validation is used to select the best model.
# Fix the random seed so the cross-validation fold assignment, and therefore
# the reported resampling results, can be reproduced.
set.seed(1234)
# Resampling scheme: cross-validation with caret's default settings.
ctrl <- trainControl(method = "cv")
# Put the outcome in the same data frame as the predictors so the formula
# refers to a single, self-contained data set instead of pulling the response
# from `train` while the predictors come from `newTrain`.
model_data <- cbind(newTrain, classe = train$classe)
# Fit the tree (caret method "ctree") with `classe` as the outcome and every
# other column of `model_data` as a predictor.
fit <- train(classe ~ ., method = "ctree", data = model_data, trControl = ctrl)
## Warning: package 'party' was built under R version 3.1.1
## Warning: package 'sandwich' was built under R version 3.1.1
The cross-validation accuracies are:
# Show the resampling summary stored in the fitted object: one row per
# candidate tuning value.
fit[["results"]]
## mincriterion Accuracy Kappa AccuracySD KappaSD
## 1 0.01 0.9785 0.9728 0.004183 0.005287
## 2 0.50 0.9785 0.9728 0.004183 0.005287
## 3 0.99 0.9779 0.9721 0.003965 0.005011
Accuracy was used to select the optimal model using the largest value. The final value used for the model was mincriterion = 0.5, with a cross-validated classification accuracy of about 97.9% (0.9785 in the table above).
The predictions for the test set are:
# Apply the fitted model to the preprocessed test data and display the
# predicted class for each test case.
prediction <- predict(object = fit, newdata = newTest)
print(prediction)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E