R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# Download the training data set. Empty strings and "NA" are both read as
# missing values, and the first column of the file is used as row names.
training <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", header = TRUE, na.strings = c("", "NA"), row.names = 1)

# Read in the test data the same way
test <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", header = TRUE, na.strings = c("", "NA"), row.names = 1)


dim(training)[2]  # 159 features

## [1] 159

dim(test)[2]      # 159 features

## [1] 159

# Both data sets have the same number of columns, which is a good start.
# Next, check whether every training column name also appears in the
# test data.
length(setdiff(colnames(training), colnames(test))) == 0

## [1] FALSE

# Result is FALSE. So although both data sets have the same number of
# columns, the column names are not exactly the same. I will deal with
# this later.

# Keep only the columns that contain no missing value at all
training2 <- training[, !vapply(training, anyNA, logical(1))]
ncol(training2)

## [1] 59

# Summary: after removing the columns that contain NA, I was left with 59 features, including the classe column.
# Let's look at the classe column to see what
# we are trying to predict.

# We are trying to predict the classe column. So that makes it
# our target variable. It is a classification problem, so
# we will narrow our algorithms to the ones that handle classification.

# First, let's remove features that barely vary across the data set.
# Such features will not help us classify the data.
# Use the nearZeroVar function in the caret package to do this.

# Load caret package
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# Removing non-varying features.
# nearZeroVar() returns the positions of the near-zero-variance columns.
# When nothing is flagged it returns integer(0), and indexing with
# -integer(0) would select zero columns, so only subset when at least one
# column was actually flagged.
nzv_columns <- nearZeroVar(training2)
if (length(nzv_columns) > 0) {
  training3 <- training2[, -nzv_columns, drop = FALSE]
} else {
  training3 <- training2
}
ncol(training3)

## [1] 58

# Only one feature was removed. I have 58 columns.

# Drop the first four columns: user_name plus the three columns that only
# describe the time of the events.
training4 <- training3[, -seq_len(4)]

# Most algorithms work better with scaled continuous data, so the predictors
# are centred and scaled. The target (classe, the last column) is
# categorical, so it is kept out of the scaling and added back afterwards.
training5 <- training4[, -ncol(training4)]
training6 <- scale(training5, center = TRUE, scale = TRUE)
# Store the target explicitly as a factor: since R 4.0 read.csv() no longer
# converts strings to factors, and the classification steps further down
# work on a factor outcome.
training8 <- data.frame(classe = factor(training4[, ncol(training4)]))
training9 <- cbind(training6, training8)

# Splitting training data into train and validation set
# 75 percent was used for training and 25 percent set aside for validation.
# Fix the random seed first so that the random partition, and every model
# trained on it afterwards, can be reproduced from run to run.
set.seed(1234)
inTrain <- createDataPartition(y = training9$classe, p = 0.75, list = FALSE)
train <- training9[inTrain, ]
validation <- training9[-inTrain, ]
nrow(train)

## [1] 14718

nrow(validation)

## [1] 4904

# Summary: We now have 14,718 data points for training
# and 4,904 data points for validation

# Target is the classe column and this has a factor level of 5
# A, B,C,D,E. So this is a classification problem.

# Name of the column we want to predict
Target <- "classe"
# Every other column of the training split is used as a predictor
PredictorNames <- setdiff(colnames(train), Target)

# Benchmarking and selecting an appropriate algorithm for the data set.
# This is a classification problem, so I chose classification
# algorithms implemented in caret. There are many; I decided to look
# at four: rpart, lda, knn and random forest.
# I don't know in advance which one will train the best model for the
# data, so the loop below trains a model with each algorithm and
# prints the cross-validated accuracy of that model.
# 3-fold cross-validation is used for every candidate.

models <- c("rpart", "lda", "knn", "rf")
# The resampling setup is the same for every algorithm, so build it once
cv_control <- trainControl(method = "cv", number = 3)
# Best cross-validated accuracy per algorithm, filled in by the loop
result <- setNames(rep(NA_real_, length(models)), models)

for (model in models) {
  BenchMarkModel1 <- train(classe ~ ., data = train, method = model, trControl = cv_control)
  # $results holds one row per tuning-parameter candidate. Report the
  # highest accuracy among them (by default caret keeps the candidate with
  # the best accuracy) instead of whichever candidate is listed first.
  Accuracy <- max(BenchMarkModel1$results$Accuracy)
  result[model] <- Accuracy
  print(paste(model, Accuracy))
}

## Loading required package: rpart

## [1] "rpart 0.591177975188258"

## Loading required package: MASS

## [1] "lda 0.711237945840184"
## [1] "knn 0.94686841654795"

## Loading required package: randomForest

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## [1] "rf 0.991982550704233"

# Result showed that random forest model was the best with an accuracy of .99

# Now use rf to train the final model and then use that model to predict
# the validation data set.
# Training the model with 10-fold cross-validation
RF_Model <- train(classe ~ ., data = train, method = "rf", trControl = trainControl(method = "cv", number = 10))
# Using the model to predict the validation data set
predictions <- predict(RF_Model, validation[, PredictorNames])
# Evaluate model accuracy on the validation data set by comparing
# predictions from the model with the truth. confusionMatrix() needs both
# arguments to be factors with the same levels, so the reference labels are
# coerced to the levels of the predictions.
Evaluation <- confusionMatrix(predictions, factor(validation[, Target], levels = levels(predictions)))

# From the confusion matrix, the 95% CI for accuracy is
# 0.9957 to 0.9987. So I estimate my out-of-sample error
# as being between .13 percent and .43 percent, i.e. less than 1 percent.
# Get the accuracy of this comparison
New <- Evaluation$overall["Accuracy"]
New

##  Accuracy 
## 0.9977569

# The accuracy is about 0.998 on the validation data set

# Now lets use this model to predict the test data set for the assignment

# Pre-processing of the test data so that it matches the features used
# to train the model
Target <- "classe"
# Get all the feature names used in training, except the
# target being predicted
PredictorNames <- setdiff(names(training9), Target)
# Subset the appropriate data from the original test
# data using the feature names
New_test_data <- test[, PredictorNames]

# Scale the test data with the centre and spread of the TRAINING data.
# scale() stored these on training6 as the "scaled:center" and
# "scaled:scale" attributes. Re-estimating them from the test rows instead
# would put the test set on a different scale from the one the model was
# fitted on.
train_center <- attr(training6, "scaled:center")[PredictorNames]
train_scale <- attr(training6, "scaled:scale")[PredictorNames]
test_data_scaled <- scale(New_test_data, center = train_center, scale = train_scale)
# Now use RF_Model to make predictions on the scaled test data
Test_predictions <- predict(RF_Model, as.data.frame(test_data_scaled))
Test_predictions

##  [1] E A A A C E D D A E B C D A E D E B E E
## Levels: A B C D E

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).