This document is devoted to predicting the manner in which participants performed a weight lifting exercise. It is based on the Weight Lifting Exercises Dataset; the dataset and its documentation are available here: http://groupware.les.inf.puc-rio.br/har. The manner of exercise execution is represented by the variable “classe”, and prediction will be performed with machine learning algorithms.
First of all, we will install packages and declare some helper functions that we will use later.
###functions
#install a package (if needed) and load its library
loadpackage <- function(name) {
  if (!require(name, character.only = TRUE)) {
    install.packages(name)
    library(package = name, character.only = TRUE)
  }
}
#return a vector of indexes to split a dataset. Vector length is equal to: "dataset length * Alpha"
splitdf <- function(dataframe, Alpha = 0.5) {
  index <- 1:nrow(dataframe)
  trainindex <- sample(index, round(length(index) * Alpha))
  return(trainindex)
}
#prepare a dataset before fitting: drop the near-zero-variance, NA-heavy and service variables found below
preparedf <- function(df) {
  df <- df[, !(names(df) %in% nearZeroVars)]
  df <- df[, !(names(df) %in% navars)]
  df <- df[, -sysvars]
  return(df)
}
loadpackage("corrplot")
loadpackage("caret")
There are two datasets: the main one (we will fit our model on it) and a validation one (for the final predictions).
#Don't forget to set your working directory
path <- "training.csv"
if (!file.exists(path)) {
download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",path)
}
originaldf <- read.csv(path, na.strings = c("#DIV/0!","NA",""))
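The validation dataset can be downloaded and read the same way (its URL is given at the end of this document). A minimal sketch; vpath and validatedf are names introduced here, not part of the original analysis:
#download and read the validation dataset (sketch)
vpath <- "testing.csv"
if (!file.exists(vpath)) {
  download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", vpath)
}
validatedf <- read.csv(vpath, na.strings = c("#DIV/0!", "NA", ""))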
We should split the main dataset into train and test data:
#we want our analysis to be reproducible, so we set the seed:
set.seed(42)
#create train and test datasets; the train dataset is 75% of the original one
inTrain <- splitdf(originaldf,0.75)
train <- originaldf[inTrain,]
test <- originaldf[-inTrain,]
dim(train)
## [1] 14716 160
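As an aside, splitdf draws a simple random sample of rows. A class-stratified split would keep the “classe” proportions similar in both parts; a minimal sketch with caret’s createDataPartition (not used in the rest of this analysis):
#stratified alternative to splitdf (sketch, not used below)
inTrainStrat <- createDataPartition(y = originaldf$classe, p = 0.75, list = FALSE)
trainStrat <- originaldf[inTrainStrat, ]
testStrat <- originaldf[-inTrainStrat, ]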
The dataset has 160 variables. That is a large number, and it may be difficult (and/or slow) to fit a model using all of them. So we need to reduce the dimension of the train dataset. To do this we should detect the less important variables and exclude them from the dataframe.
There are “service” variables: X, user_name, raw_timestamp_part_1, raw_timestamp_part_2, cvtd_timestamp, new_window, num_window. They don’t contain any data useful for prediction: X is just a row number, and the user name and time variables hold information about the participant and the date and time of the exercise, which is useless for prediction because future participants will produce different values. So we can delete them:
#remove the service variables (columns 1 to 7)
sysvars <- c(1:7)
train <- train[,-sysvars]
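Deleting by position is fragile if the column order ever changes. An equivalent name-based sketch (servicenames is a helper vector introduced here; running it after the positional removal above is a harmless no-op, since the columns are already gone):
#equivalent removal by name (sketch)
servicenames <- c("X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
                  "cvtd_timestamp", "new_window", "num_window")
train <- train[, !(names(train) %in% servicenames)]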
Some variables have (near) zero variance, so they have no value for prediction. Let’s exclude them too:
#remove predictors with zero or near-zero variance
nzvdf <- nearZeroVar(train,saveMetrics = TRUE)
nearZeroVars <- rownames(nzvdf[nzvdf$zeroVar==TRUE|nzvdf$nzv==TRUE,])
train <- train[,!(names(train) %in% nearZeroVars)]
dim(train)
## [1] 14716 122
The dimension has decreased, but not enough. So we can also remove variables that consist mostly of NA values:
#remove variables that are at least 95% NA
navars <- NULL
for (i in (1:ncol(train))) {
  naratio <- sum(is.na(train[, i])) / nrow(train)
  if (naratio >= 0.95) {
    print(paste(round(naratio, 3), names(train)[i]))
    navars <- c(navars, names(train)[i])
  }
}
## [1] "0.98 kurtosis_roll_belt"
## [1] "0.981 kurtosis_picth_belt"
## [1] "0.98 skewness_roll_belt"
## [1] "0.981 skewness_roll_belt.1"
## [1] "0.98 max_roll_belt"
## [1] "0.98 max_picth_belt"
## [1] "0.98 max_yaw_belt"
## [1] "0.98 min_roll_belt"
## [1] "0.98 min_pitch_belt"
## [1] "0.98 min_yaw_belt"
## [1] "0.98 amplitude_roll_belt"
## [1] "0.98 amplitude_pitch_belt"
## [1] "0.98 var_total_accel_belt"
## [1] "0.98 avg_roll_belt"
## [1] "0.98 stddev_roll_belt"
## [1] "0.98 var_roll_belt"
## [1] "0.98 avg_pitch_belt"
## [1] "0.98 stddev_pitch_belt"
## [1] "0.98 var_pitch_belt"
## [1] "0.98 avg_yaw_belt"
## [1] "0.98 stddev_yaw_belt"
## [1] "0.98 var_yaw_belt"
## [1] "0.98 var_accel_arm"
## [1] "0.983 kurtosis_roll_arm"
## [1] "0.983 kurtosis_picth_arm"
## [1] "0.98 kurtosis_yaw_arm"
## [1] "0.983 skewness_roll_arm"
## [1] "0.983 skewness_pitch_arm"
## [1] "0.98 skewness_yaw_arm"
## [1] "0.98 max_roll_arm"
## [1] "0.98 max_picth_arm"
## [1] "0.98 max_yaw_arm"
## [1] "0.98 min_roll_arm"
## [1] "0.98 min_pitch_arm"
## [1] "0.98 min_yaw_arm"
## [1] "0.98 amplitude_pitch_arm"
## [1] "0.98 amplitude_yaw_arm"
## [1] "0.98 kurtosis_roll_dumbbell"
## [1] "0.98 kurtosis_picth_dumbbell"
## [1] "0.98 skewness_roll_dumbbell"
## [1] "0.98 skewness_pitch_dumbbell"
## [1] "0.98 max_roll_dumbbell"
## [1] "0.98 max_picth_dumbbell"
## [1] "0.98 max_yaw_dumbbell"
## [1] "0.98 min_roll_dumbbell"
## [1] "0.98 min_pitch_dumbbell"
## [1] "0.98 min_yaw_dumbbell"
## [1] "0.98 amplitude_roll_dumbbell"
## [1] "0.98 amplitude_pitch_dumbbell"
## [1] "0.98 var_accel_dumbbell"
## [1] "0.98 avg_roll_dumbbell"
## [1] "0.98 stddev_roll_dumbbell"
## [1] "0.98 var_roll_dumbbell"
## [1] "0.98 avg_pitch_dumbbell"
## [1] "0.98 stddev_pitch_dumbbell"
## [1] "0.98 var_pitch_dumbbell"
## [1] "0.98 avg_yaw_dumbbell"
## [1] "0.98 stddev_yaw_dumbbell"
## [1] "0.98 var_yaw_dumbbell"
## [1] "0.984 kurtosis_roll_forearm"
## [1] "0.984 kurtosis_picth_forearm"
## [1] "0.984 skewness_roll_forearm"
## [1] "0.984 skewness_pitch_forearm"
## [1] "0.98 max_picth_forearm"
## [1] "0.984 max_yaw_forearm"
## [1] "0.98 min_pitch_forearm"
## [1] "0.984 min_yaw_forearm"
## [1] "0.98 amplitude_pitch_forearm"
## [1] "0.98 var_accel_forearm"
train <- train[,!(names(train) %in% navars)]
Finally, we separate the output (“classe”) from the predictors:
#store the output separately and drop it from the predictors
output <- subset(train, select =classe)
train <- train[,-ncol(train)]
dim(train)
## [1] 14716 52
We have reduced the dataframe dimension roughly threefold, but 52 variables is still a lot of predictors, and model fitting could take a long time. So we will use principal components:
###PCA
#we will keep enough principal components to explain 80% of the variance
prProc <- preProcess(train, method = "pca",thresh = 0.8)
trainPC <- predict(prProc, train)
dim(trainPC)
## [1] 14716 12
pcacount <- dim(trainPC)[2]
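The fitted preProcess object can also be inspected directly. A short sketch; to the best of my knowledge, numComp and rotation are the fields caret stores for PCA preprocessing:
#inspect the PCA preprocessing (sketch)
prProc$numComp          #number of retained components (should be 12)
dim(prProc$rotation)    #loading matrix: original variables x components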
There are only 12 principal components, so now we can fit the model.
We will use the random forest algorithm with 10-fold cross-validation.
WARNING! Model fitting can take a comparatively long time: about 10-15 minutes on a PC with the following characteristics: Intel Core i7 M620 2.67 GHz, 8 GB RAM.
start.time <- Sys.time()
### fit random forest model.
#in trainControl the default number of folds for method = "cv" is 10
fitControl <- trainControl(method="cv")
fm <- train(output$classe~., data=trainPC, method="rf", trControl=fitControl)
end.time <- Sys.time()
end.time-start.time
## Time difference of 12.13272 mins
The final model is shown below. Its OOB (out-of-bag) estimate of the error rate, 3.63%, approximates the expected out-of-sample error.
fm$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 3.63%
## Confusion matrix:
## A B C D E class.error
## A 4085 22 35 24 11 0.02202538
## B 63 2709 67 10 9 0.05213436
## C 16 30 2509 25 7 0.03015075
## D 20 9 114 2261 6 0.06182573
## E 4 25 19 18 2618 0.02459016
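The OOB error can also be extracted programmatically, e.g. to compare it with the test-set accuracy later. A sketch using randomForest’s per-tree err.rate matrix (its last row holds the final estimates):
#extract the final OOB error rate (sketch)
oob <- fm$finalModel$err.rate[fm$finalModel$ntree, "OOB"]
round(oob, 4)   #should be about 0.0363, i.e. the 3.63% shown above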
Now we can test our model:
#prepare test dataset
test2 <- preparedf(test)
dim(test2)
## [1] 4906 53
#prepare test dataset PCs
testPC <- predict(prProc,test2[,-ncol(test2)])
#predict
testPredict <- predict(fm,testPC)
#confusion matrix
cf <- confusionMatrix(test2$classe,testPredict)
accuracy <- round(cf$overall[1],2)
cf
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1378 5 10 8 2
## B 22 889 22 2 4
## C 11 13 791 14 6
## D 10 1 30 760 5
## E 0 6 6 4 907
##
## Overall Statistics
##
## Accuracy : 0.9631
## 95% CI : (0.9574, 0.9682)
## No Information Rate : 0.2896
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9533
## Mcnemar's Test P-Value : 0.01536
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9697 0.9726 0.9208 0.9645 0.9816
## Specificity 0.9928 0.9875 0.9891 0.9888 0.9960
## Pos Pred Value 0.9822 0.9468 0.9473 0.9429 0.9827
## Neg Pred Value 0.9877 0.9937 0.9833 0.9932 0.9957
## Prevalence 0.2896 0.1863 0.1751 0.1606 0.1883
## Detection Rate 0.2809 0.1812 0.1612 0.1549 0.1849
## Detection Prevalence 0.2860 0.1914 0.1702 0.1643 0.1881
## Balanced Accuracy 0.9813 0.9801 0.9550 0.9766 0.9888
one_error <- round(1/(1-accuracy))
We get Accuracy = 0.96, which is a good enough result: since 1/(1 - 0.96) = 25, it means we will get roughly 1 error in 25 cases.
The model also shows a 95% result on the grading test dataset (available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv). But I cannot present those predictions here, because that could help other students pass the final quiz.
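For completeness, a minimal sketch of how the validation predictions can be produced (validatedf as read in the sketch near the top; the resulting vector is deliberately not printed):
#predict on the validation dataset (sketch); result deliberately not printed
validate2 <- preparedf(validatedf)
validatePC <- predict(prProc, validate2[, -ncol(validate2)])
validatePredict <- predict(fm, validatePC)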