This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print summary statistics for each column of `cars` (speed and dist, per the
# output shown below).
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
library(caret)
library(gbm) # For Gradient Boosted Machines (decision tree-based)
library(dplyr)
library(corrplot) # For correlation analysis
# Download and load the data ----

# Create the local cache directory on first run only.
if (!dir.exists("data")) {
  dir.create("data")
}

base_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/"
train_url <- paste0(base_url, "pml-training.csv")
test_url <- paste0(base_url, "pml-testing.csv")

train_file <- "./data/pml-training.csv"
test_file <- "./data/pml-testing.csv"

# Only fetch files that are not already cached, so re-running the analysis
# does not hit the network again. Delete the cached file to force a refresh.
if (!file.exists(train_file)) {
  download.file(train_url, train_file)
}
if (!file.exists(test_file)) {
  download.file(test_url, test_file)
}

# Treat literal "NA", spreadsheet division errors and empty cells as missing.
na_tokens <- c("NA", "#DIV/0!", "")
training <- read.csv(train_file, na.strings = na_tokens)
testing <- read.csv(test_file, na.strings = na_tokens)
# Clean the predictors ----

# Flag columns that are more than 95% missing in the training data. The same
# mask is applied to `testing` so both frames keep matching column positions.
mostlyNA <- vapply(training, function(x) mean(is.na(x)), numeric(1)) > 0.95
training <- training[, !mostlyNA, drop = FALSE]
testing <- testing[, !mostlyNA, drop = FALSE]

# Remove near-zero-variance predictors identified on the training data.
# Guard the empty case: `df[, -integer(0)]` selects *zero* columns, so an
# unguarded negative index would wipe out both data frames when nothing is
# flagged.
nzv <- nearZeroVar(training)
if (length(nzv) > 0) {
  training <- training[, -nzv, drop = FALSE]
  testing <- testing[, -nzv, drop = FALSE]
}

# Drop the first five remaining columns.
# NOTE(review): presumably row-id / user / timestamp bookkeeping columns that
# should not be used as predictors -- confirm against the CSV header.
training <- training[, -(1:5), drop = FALSE]
testing <- testing[, -(1:5), drop = FALSE]

dim(training) # Should be 19622 x 54
# Correlation between numeric predictors ----

# vapply() guarantees a logical mask regardless of how many columns exist.
numeric_vars <- vapply(training, is.numeric, logical(1))
# drop = FALSE keeps a data frame even if only one numeric column remains.
corr_matrix <- cor(training[, numeric_vars, drop = FALSE])

# Lower-triangle colour map; small, rotated labels keep the many variable
# names legible, and the top margin leaves room for the title.
corrplot(
  corr_matrix,
  method = "color",
  type = "lower",
  tl.cex = 0.4,
  tl.srt = 45,
  title = "Correlation Matrix of Numeric Predictors",
  mar = c(0, 0, 1, 0)
)
# Train / validation split ----

# read.csv() yields character columns by default since R 4.0; make the outcome
# an explicit factor so it is handled as a classification target downstream.
training$classe <- factor(training$classe)

# Fixed seed so the 70/30 split (stratified on `classe`) is reproducible.
set.seed(12345)
inTrain <- createDataPartition(training$classe, p = 0.7, list = FALSE)
train_set <- training[inTrain, ]
valid_set <- training[-inTrain, ]
# Fit the gradient boosted model ----

# 3-fold cross-validation, repeated once, without per-iteration logging.
control <- trainControl(
  method = "repeatedcv",
  number = 3,
  repeats = 1,
  verboseIter = FALSE
)

# Predict `classe` from every remaining column of the training split.
# `verbose = FALSE` is passed through train()'s `...`; presumably it silences
# the underlying gbm fitting log -- confirm against the gbm documentation.
model_gbm <- train(
  classe ~ .,
  data = train_set,
  method = "gbm",
  trControl = control,
  verbose = FALSE
)

# Print the resampling results and the selected tuning parameters.
model_gbm
# Evaluate on the held-out validation split ----

pred_valid <- predict(model_gbm, valid_set)

# confusionMatrix() needs both arguments to be factors with the same levels.
# Coerce the reference explicitly so this also works when `classe` was read
# as character; it is a no-op when it is already a matching factor.
valid_truth <- factor(valid_set$classe, levels = levels(pred_valid))
cm <- confusionMatrix(pred_valid, valid_truth)
cm
# Variable importance ----

# Compute the importance scores for the fitted model and plot the ten
# highest-ranked predictors.
varImp_gbm <- varImp(model_gbm)
plot(varImp_gbm, top = 10, main = "Top 10 Important Variables in GBM")
# Predict the held-out test cases ----

# `testing` went through the same column filtering as `training`, so it can
# be passed directly to the fitted model.
final_predictions <- predict(model_gbm, testing)
final_predictions
#' Write each prediction to its own text file
#'
#' Element `i` of `x` is written, unquoted and without row or column names, to
#' `<path>/problem_id_<i>.txt`. The output directory is created if missing.
#'
#' @param x Vector of predictions (for example a factor or character vector).
#' @param path Directory that receives the files. Defaults to
#'   `"./predictions"`, the location the original implementation hard-coded.
#' @return Invisibly, the character vector of file paths that were written
#'   (length zero when `x` is empty).
pml_write_files <- function(x, path = "./predictions") {
  if (!dir.exists(path)) {
    dir.create(path)
  }

  # seq_along() yields an empty sequence for empty input, whereas 1:length(x)
  # would iterate over c(1, 0) and write two bogus files.
  idx <- seq_along(x)
  out_files <- file.path(path, paste0("problem_id_", idx, ".txt"))

  for (i in idx) {
    write.table(
      x[i],
      file = out_files[i],
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }

  invisible(out_files)
}
# Write one submission file per test case.
pml_write_files(final_predictions)

# Render the write-up. knitr sets the `knitr.in.progress` option while a
# document is being knitted; skip the call in that case so that, if this code
# lives inside the document being rendered, it does not re-render itself
# recursively. Run from the console, the call behaves as before.
if (!isTRUE(getOption("knitr.in.progress"))) {
  rmarkdown::render("Prediction Assignment Writeup.Rmd")
}