Load the data from my desktop and remove the X variable (the first column).
# Read the raw training and testing sets, then drop the first column of each
# (the one referred to as the X variable).
# stringsAsFactors = TRUE is spelled out because the rest of the analysis
# relies on text columns being factors (`levels()` is read from them later and
# `classe` is the classification target); R >= 4.0.0 changed the read.csv()
# default to FALSE, which would turn those columns into character vectors.
pml.training <- read.csv("~/Desktop/pml-training.csv", stringsAsFactors = TRUE)
pml.testing <- read.csv("~/Desktop/pml-testing.csv", stringsAsFactors = TRUE)
pml.training <- pml.training[-1]
pml.testing <- pml.testing[-1]
Load the required packages.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(FSelector)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
Set the seed and split the training set into a 60% model-building partition and a 40% hold-out partition.
# Fix the RNG seed so the random split below is repeatable.
set.seed(24)
# Row indices of the model-building partition: p = 0.6 of the rows, drawn
# from the `classe` outcome.
training <- createDataPartition(
  y = pml.training$classe,
  p = 0.6,
  list = FALSE
)
# The selected rows are used for fitting; every other row is held out.
set.training <- pml.training[training, ]
set.testing <- pml.training[-training, ]
Assign blank cells to NA and remove columns with a missing ratio larger than 0.5.
# Treat empty strings as missing so blank cells count towards the NA ratio.
set.training[set.training == ""] <- NA
# Fraction of missing values in each column, computed column-wise in one pass.
# This replaces a 1:length() loop that grew an index vector with c() and that
# would fail on a data frame with zero columns.
na.ratio <- vapply(
  set.training,
  function(column) mean(is.na(column)),
  numeric(1)
)
# Positions (unnamed, so they print as plain integers) and names of the
# columns whose missing ratio is larger than 0.5.
na.factor <- unname(which(na.ratio > 0.5))
na.variables <- colnames(set.training)[na.factor]
na.factor
## [1] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
## [18] 28 29 30 31 32 33 34 35 49 50 51 52 53 54 55 56 57
## [35] 58 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 86
## [52] 87 88 89 90 91 92 93 94 95 96 97 98 99 100 102 103 104
## [69] 105 106 107 108 109 110 111 124 125 126 127 128 129 130 131 132 133
## [86] 134 135 136 137 138 140 141 142 143 144 145 146 147 148 149
# Show the names of the columns flagged as mostly missing.
na.variables
## [1] "kurtosis_roll_belt" "kurtosis_picth_belt"
## [3] "kurtosis_yaw_belt" "skewness_roll_belt"
## [5] "skewness_roll_belt.1" "skewness_yaw_belt"
## [7] "max_roll_belt" "max_picth_belt"
## [9] "max_yaw_belt" "min_roll_belt"
## [11] "min_pitch_belt" "min_yaw_belt"
## [13] "amplitude_roll_belt" "amplitude_pitch_belt"
## [15] "amplitude_yaw_belt" "var_total_accel_belt"
## [17] "avg_roll_belt" "stddev_roll_belt"
## [19] "var_roll_belt" "avg_pitch_belt"
## [21] "stddev_pitch_belt" "var_pitch_belt"
## [23] "avg_yaw_belt" "stddev_yaw_belt"
## [25] "var_yaw_belt" "var_accel_arm"
## [27] "avg_roll_arm" "stddev_roll_arm"
## [29] "var_roll_arm" "avg_pitch_arm"
## [31] "stddev_pitch_arm" "var_pitch_arm"
## [33] "avg_yaw_arm" "stddev_yaw_arm"
## [35] "var_yaw_arm" "kurtosis_roll_arm"
## [37] "kurtosis_picth_arm" "kurtosis_yaw_arm"
## [39] "skewness_roll_arm" "skewness_pitch_arm"
## [41] "skewness_yaw_arm" "max_roll_arm"
## [43] "max_picth_arm" "max_yaw_arm"
## [45] "min_roll_arm" "min_pitch_arm"
## [47] "min_yaw_arm" "amplitude_roll_arm"
## [49] "amplitude_pitch_arm" "amplitude_yaw_arm"
## [51] "kurtosis_roll_dumbbell" "kurtosis_picth_dumbbell"
## [53] "kurtosis_yaw_dumbbell" "skewness_roll_dumbbell"
## [55] "skewness_pitch_dumbbell" "skewness_yaw_dumbbell"
## [57] "max_roll_dumbbell" "max_picth_dumbbell"
## [59] "max_yaw_dumbbell" "min_roll_dumbbell"
## [61] "min_pitch_dumbbell" "min_yaw_dumbbell"
## [63] "amplitude_roll_dumbbell" "amplitude_pitch_dumbbell"
## [65] "amplitude_yaw_dumbbell" "var_accel_dumbbell"
## [67] "avg_roll_dumbbell" "stddev_roll_dumbbell"
## [69] "var_roll_dumbbell" "avg_pitch_dumbbell"
## [71] "stddev_pitch_dumbbell" "var_pitch_dumbbell"
## [73] "avg_yaw_dumbbell" "stddev_yaw_dumbbell"
## [75] "var_yaw_dumbbell" "kurtosis_roll_forearm"
## [77] "kurtosis_picth_forearm" "kurtosis_yaw_forearm"
## [79] "skewness_roll_forearm" "skewness_pitch_forearm"
## [81] "skewness_yaw_forearm" "max_roll_forearm"
## [83] "max_picth_forearm" "max_yaw_forearm"
## [85] "min_roll_forearm" "min_pitch_forearm"
## [87] "min_yaw_forearm" "amplitude_roll_forearm"
## [89] "amplitude_pitch_forearm" "amplitude_yaw_forearm"
## [91] "var_accel_forearm" "avg_roll_forearm"
## [93] "stddev_roll_forearm" "var_roll_forearm"
## [95] "avg_pitch_forearm" "stddev_pitch_forearm"
## [97] "var_pitch_forearm" "avg_yaw_forearm"
## [99] "stddev_yaw_forearm" "var_yaw_forearm"
# Drop the mostly-missing columns. Negative indexing with an empty index
# (`-c()` or `-integer(0)`) selects zero columns and would wipe the whole
# data frame, so only subset when at least one column was flagged.
if (length(na.factor) > 0) {
  set.training <- set.training[-na.factor]
}
dim(set.training)
## [1] 11776 59
Exclude the variables with the lowest information gain and keep the 50 most informative predictors.
# Score every remaining predictor by its information gain with respect to
# the `classe` outcome.
weights <- information.gain(classe ~ ., data = set.training)
print(weights)
## attr_importance
## user_name 0.0062586027
## raw_timestamp_part_1 1.5725959981
## raw_timestamp_part_2 0.0000000000
## cvtd_timestamp 0.8342875645
## new_window 0.0001378553
## num_window 1.5871834158
## roll_belt 0.4590724270
## pitch_belt 0.4665361070
## yaw_belt 0.6123113534
## total_accel_belt 0.1697729296
## gyros_belt_x 0.0830804825
## gyros_belt_y 0.1228110648
## gyros_belt_z 0.1829702726
## accel_belt_x 0.1197856822
## accel_belt_y 0.1327965264
## accel_belt_z 0.2552771709
## magnet_belt_x 0.1322141967
## magnet_belt_y 0.2112466915
## magnet_belt_z 0.2301516985
## roll_arm 0.1393831845
## pitch_arm 0.0744428853
## yaw_arm 0.0961434276
## total_accel_arm 0.0780826038
## gyros_arm_x 0.1376371702
## gyros_arm_y 0.1046174708
## gyros_arm_z 0.0366178143
## accel_arm_x 0.1064312904
## accel_arm_y 0.1119720526
## accel_arm_z 0.0931272275
## magnet_arm_x 0.1297399515
## magnet_arm_y 0.1047247255
## magnet_arm_z 0.0964243667
## roll_dumbbell 0.1839672851
## pitch_dumbbell 0.1442416099
## yaw_dumbbell 0.1830960353
## total_accel_dumbbell 0.1539185521
## gyros_dumbbell_x 0.0813706973
## gyros_dumbbell_y 0.1754185613
## gyros_dumbbell_z 0.0569969148
## accel_dumbbell_x 0.2004004879
## accel_dumbbell_y 0.1717197499
## accel_dumbbell_z 0.2204389817
## magnet_dumbbell_x 0.2519070136
## magnet_dumbbell_y 0.2937401230
## magnet_dumbbell_z 0.2298041118
## roll_forearm 0.2743400674
## pitch_forearm 0.1989772578
## yaw_forearm 0.0792408866
## total_accel_forearm 0.0731406800
## gyros_forearm_x 0.0422398606
## gyros_forearm_y 0.0826459839
## gyros_forearm_z 0.0514385900
## accel_forearm_x 0.1211613352
## accel_forearm_y 0.0918417767
## accel_forearm_z 0.0632870893
## magnet_forearm_x 0.1282669557
## magnet_forearm_y 0.1348852424
## magnet_forearm_z 0.0851335414
# Keep the names of the 50 highest-scoring attributes.
# NOTE(review): the top-ranked attributes (num_window, raw_timestamp_part_1,
# cvtd_timestamp) look like bookkeeping fields rather than sensor readings;
# confirm that using them as predictors is intended.
factor.train <- cutoff.k(attrs = weights, k = 50)
factor.train
## [1] "num_window" "raw_timestamp_part_1" "cvtd_timestamp"
## [4] "yaw_belt" "pitch_belt" "roll_belt"
## [7] "magnet_dumbbell_y" "roll_forearm" "accel_belt_z"
## [10] "magnet_dumbbell_x" "magnet_belt_z" "magnet_dumbbell_z"
## [13] "accel_dumbbell_z" "magnet_belt_y" "accel_dumbbell_x"
## [16] "pitch_forearm" "roll_dumbbell" "yaw_dumbbell"
## [19] "gyros_belt_z" "gyros_dumbbell_y" "accel_dumbbell_y"
## [22] "total_accel_belt" "total_accel_dumbbell" "pitch_dumbbell"
## [25] "roll_arm" "gyros_arm_x" "magnet_forearm_y"
## [28] "accel_belt_y" "magnet_belt_x" "magnet_arm_x"
## [31] "magnet_forearm_x" "gyros_belt_y" "accel_forearm_x"
## [34] "accel_belt_x" "accel_arm_y" "accel_arm_x"
## [37] "magnet_arm_y" "gyros_arm_y" "magnet_arm_z"
## [40] "yaw_arm" "accel_arm_z" "accel_forearm_y"
## [43] "magnet_forearm_z" "gyros_belt_x" "gyros_forearm_y"
## [46] "gyros_dumbbell_x" "yaw_forearm" "total_accel_arm"
## [49] "pitch_arm" "total_accel_forearm"
Build the model formula and fit the random forest model.
# Assemble `classe ~ <selected attributes>` from the chosen attribute names.
formula.train <- as.simple.formula(
  attributes = factor.train,
  class = "classe"
)
print(formula.train)
## classe ~ num_window + raw_timestamp_part_1 + cvtd_timestamp +
## yaw_belt + pitch_belt + roll_belt + magnet_dumbbell_y + roll_forearm +
## accel_belt_z + magnet_dumbbell_x + magnet_belt_z + magnet_dumbbell_z +
## accel_dumbbell_z + magnet_belt_y + accel_dumbbell_x + pitch_forearm +
## roll_dumbbell + yaw_dumbbell + gyros_belt_z + gyros_dumbbell_y +
## accel_dumbbell_y + total_accel_belt + total_accel_dumbbell +
## pitch_dumbbell + roll_arm + gyros_arm_x + magnet_forearm_y +
## accel_belt_y + magnet_belt_x + magnet_arm_x + magnet_forearm_x +
## gyros_belt_y + accel_forearm_x + accel_belt_x + accel_arm_y +
## accel_arm_x + magnet_arm_y + gyros_arm_y + magnet_arm_z +
## yaw_arm + accel_arm_z + accel_forearm_y + magnet_forearm_z +
## gyros_belt_x + gyros_forearm_y + gyros_dumbbell_x + yaw_forearm +
## total_accel_arm + pitch_arm + total_accel_forearm
## <environment: 0x7fd13cbbaa98>
# Fit the random forest on the model-building partition, using only the
# predictors named in the formula.
model.train <- randomForest(
  formula.train,
  data = set.training
)
Use the model to predict the held-out partition of the training data and compare the results with the true classes.
# Predict the class of every held-out row, then tabulate the predictions
# against the known `classe` values.
prediction.insample <- predict(
  model.train,
  newdata = set.testing,
  type = "class"
)
confusionMatrix(
  data = prediction.insample,
  reference = set.testing$classe
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2231 3 0 0 0
## B 1 1515 3 0 0
## C 0 0 1365 0 0
## D 0 0 0 1285 0
## E 0 0 0 1 1442
##
## Overall Statistics
##
## Accuracy : 0.999
## 95% CI : (0.998, 0.9996)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9987
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9996 0.9980 0.9978 0.9992 1.0000
## Specificity 0.9995 0.9994 1.0000 1.0000 0.9998
## Pos Pred Value 0.9987 0.9974 1.0000 1.0000 0.9993
## Neg Pred Value 0.9998 0.9995 0.9995 0.9998 1.0000
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2843 0.1931 0.1740 0.1638 0.1838
## Detection Prevalence 0.2847 0.1936 0.1740 0.1638 0.1839
## Balanced Accuracy 0.9995 0.9987 0.9989 0.9996 0.9999
Align the factor levels of the variables in the testing data with those in the training data.
# Re-encode the two factor predictors of the testing data against the
# training levels. Assigning with `levels(x) <- ...` only renames the existing
# levels by position, which relabels values whenever the testing data holds a
# different set of levels than the training data. Rebuilding the factor from
# its character values keeps every value attached to its own label; values
# never seen in training become NA.
# Assumes the training columns are factors, so that levels() is not NULL.
pml.testing$new_window <- factor(
  as.character(pml.testing$new_window),
  levels = levels(set.training$new_window)
)
pml.testing$cvtd_timestamp <- factor(
  as.character(pml.testing$cvtd_timestamp),
  levels = levels(set.training$cvtd_timestamp)
)
Use the trained model to predict the testing data and get the answers.
# Predict the class of each row of the testing data with the fitted model.
answers <- predict(
  model.train,
  newdata = pml.testing,
  type = "class"
)
answers
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Write each prediction to its own text file in the working directory.
#
# x: vector (or factor) of answers; element i is written, without quotes or
#    row/column names, to "problem_id_<i>.txt".
# Returns NULL invisibly (the value of the final loop).
pml_write_files <- function(x) {
  # seq_along() makes an empty `x` write nothing; 1:length(x) would iterate
  # over c(1, 0) and create bogus files.
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(
      x[i],
      file = filename,
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
}
# Write one answer file per predicted test case.
pml_write_files(x = answers)
For more information and references, please visit my site: tiiome.com