The code for this is online at https://github.com/luizfelipebrito/Supervised-Learning-Machine-Learning-with-R
The main purpose of this experiment is to predict whether or not heart disease is present in the patient.
This database contains 76 attributes, but I am using a subset of 14 of them. The “goal” field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Heart Disease UCI: https://www.kaggle.com/ronitf/heart-disease-uci
# Remove every object from the current workspace so the analysis starts clean.
rm(list = ls())
# Emit a form-feed character.
# NOTE(review): presumably intended to clear the RStudio console — confirm.
cat("\014")
library(e1071)
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# NOTE(review): machine-specific working directory; adjust before running.
setwd("D:\\Machine_Learning")

# Read the heart-disease data. "UTF-8-BOM" strips a leading byte-order mark
# when present, so the first header is read as "age" instead of "ï..age".
database <- read.csv("heart.csv", header = TRUE, fileEncoding = "UTF-8-BOM")

# Fallback in case the header still carries the mangled BOM prefix.
names(database)[names(database) == "ï..age"] <- "age"

# Map the terse source column names to descriptive ones. Columns whose name
# is already the desired one (sex, thal, target) need no entry.
column_renames <- c(
  cp = "chest_pain_type",
  trestbps = "resting_blood_pressure",
  chol = "serum_cholestoral",
  fbs = "fasting_blood_sugar",
  restecg = "resting_cardiographic_results",
  thalach = "maximum_heart_rate",
  exang = "exercise_induced_angina",
  oldpeak = "depression_induced_exercise_rest",
  slope = "slope_peak_exercise",
  ca = "number_major_vessels"
)
to_rename <- names(database) %in% names(column_renames)
names(database)[to_rename] <-
  unname(column_renames[names(database)[to_rename]])
Missing values play an important role in statistics and data analysis. Often, missing values must not be ignored; rather, they should be carefully studied to see whether there is an underlying pattern or cause for their missingness.
# Size of the data set: number of columns, then number of rows.
ncol(database)
## [1] 14
nrow(database)
## [1] 303
# Tally missing versus non-missing cells over the whole data frame.
table(is.na(database))
##
## FALSE
## 4242
Using factors with labels is better than using integers because factors are self-describing. Having a variable whose values are “Male” and “Female” is better than one coded as 1 and 2.
# Modelling copy taken BEFORE the labelling below: in data_heart only the
# outcome becomes a factor; every predictor keeps the type it was read with.
data_heart <- database
data_heart$target <- factor(data_heart$target)

# Attach readable, ordered labels to the coded columns of `database`.
database$sex <- ordered(
  database$sex,
  levels = c(0, 1),
  labels = c("female", "male")
)
database$chest_pain_type <- ordered(
  database$chest_pain_type,
  levels = c(0, 1, 2, 3),
  labels = c("typical angina", "atypical angina", "non-anginal pain", "asymptomatic")
)
# The logical labels are coerced to the strings "FALSE" and "TRUE".
database$fasting_blood_sugar <- ordered(
  database$fasting_blood_sugar,
  levels = c(0, 1),
  labels = c(FALSE, TRUE)
)
database$resting_cardiographic_results <- ordered(
  database$resting_cardiographic_results,
  levels = c(0, 1, 2),
  labels = c("normal", "having wave abnormality", "probable ventricular hypertrophy")
)
database$exercise_induced_angina <- ordered(
  database$exercise_induced_angina,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
database$slope_peak_exercise <- ordered(
  database$slope_peak_exercise,
  levels = c(0, 1, 2),
  labels = c("upsloping", "flat", "downsloping")
)

# These two columns stay numeric.
database$number_major_vessels <- as.numeric(database$number_major_vessels)
database$thal <- as.numeric(database$thal)

database$target <- ordered(
  database$target,
  levels = c(0, 1),
  labels = c("absence", "presence")
)
STR
# Show the structure of the labelled data frame: one line per column with its
# type and first few values.
str(database)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : Ord.factor w/ 2 levels "female"<"male": 2 2 1 2 1 2 1 2 2 2 ...
## $ chest_pain_type : Ord.factor w/ 4 levels "typical angina"<..: 4 3 2 2 1 1 2 2 3 3 ...
## $ resting_blood_pressure : int 145 130 130 120 120 140 140 120 172 150 ...
## $ serum_cholestoral : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fasting_blood_sugar : Ord.factor w/ 2 levels "FALSE"<"TRUE": 2 1 1 1 1 1 1 1 2 1 ...
## $ resting_cardiographic_results : Ord.factor w/ 3 levels "normal"<"having wave abnormality"<..: 1 2 1 2 2 2 1 2 2 2 ...
## $ maximum_heart_rate : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exercise_induced_angina : Ord.factor w/ 2 levels "No"<"Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ depression_induced_exercise_rest: num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope_peak_exercise : Ord.factor w/ 3 levels "upsloping"<"flat"<..: 1 1 3 3 3 2 2 3 3 3 ...
## $ number_major_vessels : num 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : num 1 2 2 2 2 1 2 3 3 2 ...
## $ target : Ord.factor w/ 2 levels "absence"<"presence": 2 2 2 2 2 2 2 2 2 2 ...
Summary
# Per-column summary: quantiles and mean for numeric columns, level counts for
# factor columns.
summary(database)
## age sex chest_pain_type
## Min. :29.00 female: 96 typical angina :143
## 1st Qu.:47.50 male :207 atypical angina : 50
## Median :55.00 non-anginal pain: 87
## Mean :54.37 asymptomatic : 23
## 3rd Qu.:61.00
## Max. :77.00
## resting_blood_pressure serum_cholestoral fasting_blood_sugar
## Min. : 94.0 Min. :126.0 FALSE:258
## 1st Qu.:120.0 1st Qu.:211.0 TRUE : 45
## Median :130.0 Median :240.0
## Mean :131.6 Mean :246.3
## 3rd Qu.:140.0 3rd Qu.:274.5
## Max. :200.0 Max. :564.0
## resting_cardiographic_results maximum_heart_rate
## normal :147 Min. : 71.0
## having wave abnormality :152 1st Qu.:133.5
## probable ventricular hypertrophy: 4 Median :153.0
## Mean :149.6
## 3rd Qu.:166.0
## Max. :202.0
## exercise_induced_angina depression_induced_exercise_rest
## No :204 Min. :0.00
## Yes: 99 1st Qu.:0.00
## Median :0.80
## Mean :1.04
## 3rd Qu.:1.60
## Max. :6.20
## slope_peak_exercise number_major_vessels thal target
## upsloping : 21 Min. :0.0000 Min. :0.000 absence :138
## flat :140 1st Qu.:0.0000 1st Qu.:2.000 presence:165
## downsloping:142 Median :0.0000 Median :2.000
## Mean :0.7294 Mean :2.314
## 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :4.0000 Max. :3.000
Outliers in data can distort predictions and affect accuracy if you don’t detect and handle them appropriately, especially in regression models. Why is outlier treatment important? Because outliers can drastically bias or change the fit estimates and predictions. A handy explanation of outliers can be found at: https://www.r-bloggers.com/outlier-detection-and-treatment-with-r/
For a given continuous variable, outliers are those observations that lie more than 1.5 * IQR below the 25th percentile or above the 75th percentile, where IQR, the ‘Inter Quartile Range’, is the difference between the 75th and 25th percentiles. Look at the points outside the whiskers in the box plots below.
Age
# Univariate outlier check for age.
summary(database[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 47.50 55.00 54.37 61.00 77.00
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["age"]])[["out"]]
boxplot(database[["age"]], main = "Detect Outliers - age", boxwex = 0.1)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Sex
# sex is categorical, so the IQR-based outlier rule does not apply:
# boxplot.stats() on an ordered factor only raises "'+' is not meaningful for
# ordered factors". Inspect the level frequencies instead.
summary(database$sex)
## female male
## 96 207
barplot(table(database$sex), main = "Level counts - sex")
Chest pain type
# chest_pain_type is categorical, so the IQR-based outlier rule does not
# apply: boxplot.stats() on an ordered factor only raises "'+' is not
# meaningful for ordered factors". Inspect the level frequencies instead.
summary(database$chest_pain_type)
## typical angina atypical angina non-anginal pain asymptomatic
## 143 50 87 23
barplot(
  table(database$chest_pain_type),
  main = "Level counts - chest_pain_type",
  cex.names = 0.7
)
Resting blood pressure
# Univariate outlier check for resting blood pressure.
summary(database[["resting_blood_pressure"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 94.0 120.0 130.0 131.6 140.0 200.0
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["resting_blood_pressure"]])[["out"]]
boxplot(
  database[["resting_blood_pressure"]],
  main = "Detect Outliers - resting_blood_pressure",
  boxwex = 0.1
)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Serum cholestoral
# Univariate outlier check for serum cholesterol.
summary(database[["serum_cholestoral"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 126.0 211.0 240.0 246.3 274.5 564.0
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["serum_cholestoral"]])[["out"]]
boxplot(
  database[["serum_cholestoral"]],
  main = "Detect Outliers - serum_cholestoral",
  boxwex = 0.1
)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Fasting blood sugar
# fasting_blood_sugar is categorical, so the IQR-based outlier rule does not
# apply: boxplot.stats() on an ordered factor only raises "'+' is not
# meaningful for ordered factors". Inspect the level frequencies instead.
summary(database$fasting_blood_sugar)
## FALSE TRUE
## 258 45
barplot(
  table(database$fasting_blood_sugar),
  main = "Level counts - fasting_blood_sugar"
)
Resting cardiographic results
# resting_cardiographic_results is categorical, so the IQR-based outlier rule
# does not apply: boxplot.stats() on an ordered factor only raises "'+' is not
# meaningful for ordered factors". Inspect the level frequencies instead.
summary(database$resting_cardiographic_results)
## normal having wave abnormality
## 147 152
## probable ventricular hypertrophy
## 4
barplot(
  table(database$resting_cardiographic_results),
  main = "Level counts - resting_cardiographic_results",
  cex.names = 0.7
)
Maximum heart rate
# Univariate outlier check for maximum heart rate.
summary(database[["maximum_heart_rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.0 133.5 153.0 149.6 166.0 202.0
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["maximum_heart_rate"]])[["out"]]
boxplot(
  database[["maximum_heart_rate"]],
  main = "Detect Outliers - maximum_heart_rate",
  boxwex = 0.1
)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Exercise induced angina
# exercise_induced_angina is categorical, so the IQR-based outlier rule does
# not apply: boxplot.stats() on an ordered factor only raises "'+' is not
# meaningful for ordered factors". Inspect the level frequencies instead.
summary(database$exercise_induced_angina)
## No Yes
## 204 99
barplot(
  table(database$exercise_induced_angina),
  main = "Level counts - exercise_induced_angina"
)
Depression induced exercise rest
# Univariate outlier check for exercise-induced ST depression.
summary(database[["depression_induced_exercise_rest"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.80 1.04 1.60 6.20
# Values that boxplot.stats() reports in its `out` component.
outlier_values <-
  boxplot.stats(database[["depression_induced_exercise_rest"]])[["out"]]
boxplot(
  database[["depression_induced_exercise_rest"]],
  main = "Detect Outliers - depression_induced_exercise_rest",
  boxwex = 0.1
)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Slope peak exercise
# slope_peak_exercise is categorical, so the IQR-based outlier rule does not
# apply: boxplot.stats() on an ordered factor only raises "'+' is not
# meaningful for ordered factors". Inspect the level frequencies instead.
summary(database$slope_peak_exercise)
## upsloping flat downsloping
## 21 140 142
barplot(
  table(database$slope_peak_exercise),
  main = "Level counts - slope_peak_exercise"
)
Number major vessels
# Univariate outlier check for the number of major vessels.
summary(database[["number_major_vessels"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7294 1.0000 4.0000
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["number_major_vessels"]])[["out"]]
boxplot(
  database[["number_major_vessels"]],
  main = "Detect Outliers - number_major_vessels",
  boxwex = 0.1
)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
thal
# Univariate outlier check for thal (kept numeric in this analysis).
summary(database[["thal"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 2.000 2.314 3.000 3.000
# Values that boxplot.stats() reports in its `out` component.
outlier_values <- boxplot.stats(database[["thal"]])[["out"]]
boxplot(database[["thal"]], main = "Detect Outliers - thal", boxwex = 0.1)
# List the flagged values in the plot margin.
mtext(paste("Outliers: ", toString(outlier_values)), cex = 0.6)
Target
# target is categorical, so the IQR-based outlier rule does not apply:
# boxplot.stats() on an ordered factor only raises "'+' is not meaningful for
# ordered factors". Inspect the class balance instead.
summary(database$target)
## absence presence
## 138 165
barplot(table(database$target), main = "Level counts - target")
Let’s try a bivariate approach, so that we can visualize box plots of a continuous variable Y for each level of a categorical X (target).
Resting blood pressure
# One box per target class for resting blood pressure.
boxplot(
  resting_blood_pressure ~ target,
  data = database,
  main = "resting_blood_pressure (continuos var) vs target"
)
Serum cholestoral
# One box per target class for serum cholesterol.
boxplot(
  serum_cholestoral ~ target,
  data = database,
  main = "serum_cholestoral (continuos var) vs target"
)
Maximum heart Rate
# One box per target class for maximum heart rate.
boxplot(
  maximum_heart_rate ~ target,
  data = database,
  main = "maximum_heart_rate (continuos var) vs target"
)
Depression induced exercise rest
# One box per target class for exercise-induced ST depression.
boxplot(
  depression_induced_exercise_rest ~ target,
  data = database,
  main = "depression_induced_exercise_rest (continuos var) vs target"
)
nearZeroVar diagnoses predictors that have one unique value (i.e. zero-variance predictors) or predictors that have both of the following characteristics: they have very few unique values relative to the number of samples, and the ratio of the frequency of the most common value to the frequency of the second most common value is large.
# Tabulate whatever caret's nearZeroVar() flags in the modelling data; the
# recorded empty table means no column was flagged.
table(nearZeroVar(data_heart))
## < table of extent 0 >
Statistical methods for exploratory analysis. These methods include clustering and dimension-reduction techniques. They are useful for visualizing high-dimensional (multidimensional) data. Clustering organizes things that are close into groups. Probably the most familiar distance metric is the Euclidean distance, which is just the straight-line distance between any two points.
# Complete-linkage hierarchical clustering of the rows of data_heart, using
# Euclidean distances between rows.
# NOTE(review): data_heart still contains the target column and the variables
# are not rescaled here — confirm this is intended.
hc <- hclust(dist(data_heart, method = "euclidean"), method="complete")
# Draw the resulting cluster tree.
plot(hc)
Hierarchical clustering gives us an idea of the relationships between variables or observations.
# Convert the clustering result to a dendrogram object and draw it.
hc_dendrogram <- as.dendrogram(hc)
plot(hc_dendrogram)
Exploratory graphs help us find patterns in data and understand its properties. They suggest modeling strategies and help communicate results. - Show comparisons. - Show causality, mechanism, explanation, systematic structure. - Show multivariate data. - Integrate multiple modes of evidence.
# Write a 2 x 2 panel of exploratory plots to a PNG file.
png("plot_machine_learning.png")
par(mfrow = c(2, 2))

# With a factor on the x axis, plot() draws one box per target class. The
# former `type` and `plot` arguments are not graphical parameters here and
# only produced warnings, so they are omitted.
plot(x = database$target, y = database$age, col = "black",
     xlab = "Target", ylab = "Age")
plot(x = database$target, y = database$sex, col = "black",
     xlab = "Target", ylab = "Sex")
plot(x = database$target, y = database$resting_blood_pressure, col = "black",
     xlab = "", ylab = "Outliers")
lines(x = database$target, y = database$serum_cholestoral, col = "red")
lines(x = database$target, y = database$maximum_heart_rate, col = "blue")
legend("topright", lty = "solid", col = c("black", "red", "blue"),
       legend = c("resting_blood_pressure", "serum_cholestoral",
                  "maximum_heart_rate"),
       bty = "n")

# Fourth panel: a base-graphics histogram with unit-width bins centred on
# whole numbers. A ggplot does not honour par(mfrow) and would start a new
# page on this device instead of filling the remaining panel.
df <- select(data_heart, target, depression_induced_exercise_rest)
depression <- df$depression_induced_exercise_rest
hist(depression,
     breaks = seq(floor(min(depression)) - 0.5,
                  ceiling(max(depression)) + 0.5,
                  by = 1),
     main = "", xlab = "depression_induced_exercise_rest", ylab = "count")

# Close the device so the file is completely written to disk.
invisible(dev.off())
## png
## 3
Always set the random number seed when conducting a simulation.
# Fix the random-number seed so that the random train/test split drawn
# afterwards is reproducible.
set.seed(1)
The test set is used to assess the generalization error of the final chosen model. Ideally, the test set should be kept in a “vault” and be brought out only at the end of the data analysis. The code below randomly generates the row indices for the test set. We split out 30% for testing and 70% for training.
# Hold out 30% of the rows (rounded down) as the test set; everything else is
# training data.
n_rows <- nrow(data_heart)
indexes <- sample(seq_len(n_rows), size = floor(0.3 * n_rows))
# setdiff() rather than negative indexing: df[-integer(0), ] selects no rows,
# which would empty the training set whenever no test row is drawn.
train <- data_heart[setdiff(seq_len(n_rows), indexes), ]
test <- data_heart[indexes, ]
# Fit an SVM classifier on the training set and report the fitting time.
# probability = TRUE lets predict() return class probabilities later on.
system.time(svm_model <- svm(target ~ ., train, probability = TRUE))
## user system elapsed
## 0.11 0.00 0.13
# Predict the held-out rows; the probabilities come back as an attribute.
predictionsSVM <- predict(svm_model, test, probability = TRUE)
# Rows: predicted class, columns: true class.
table(predictionsSVM, test$target)
##
## predictionsSVM 0 1
## 0 33 6
## 1 10 41
# Share of test rows whose predicted class equals the true class.
acuracy <- 1 - mean(predictionsSVM != test$target)
acuracy
## [1] 0.8222222
summary(svm_model)
##
## Call:
## svm(formula = target ~ ., data = train, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.07692308
##
## Number of Support Vectors: 132
##
## ( 67 65 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Put true class, predicted class and class probabilities side by side.
probabilities <- attr(predictionsSVM, "probabilities")
predictionsAndProbabilities <- cbind(test$target, predictionsSVM, probabilities)
View(predictionsAndProbabilities)
Metric that evaluates the level of agreement of a classification task
# Confusion table for the SVM (rows: predicted, columns: true).
cm <- table(predictionsSVM, test$target)
cm
##
## predictionsSVM 0 1
## 0 33 6
## 1 10 41
# Pick the kappa statistic out of caret's overall metrics by name.
kappa <- confusionMatrix(cm)[["overall"]]["Kappa"]
kappa
## Kappa
## 0.6423249
Confusion MatriX
# Full caret report for the confusion table `cm`: accuracy, kappa,
# sensitivity, specificity and related statistics.
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predictionsSVM 0 1
## 0 33 6
## 1 10 41
##
## Accuracy : 0.8222
## 95% CI : (0.7274, 0.8948)
## No Information Rate : 0.5222
## P-Value [Acc > NIR] : 2.711e-09
##
## Kappa : 0.6423
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.7674
## Specificity : 0.8723
## Pos Pred Value : 0.8462
## Neg Pred Value : 0.8039
## Prevalence : 0.4778
## Detection Rate : 0.3667
## Detection Prevalence : 0.4333
## Balanced Accuracy : 0.8199
##
## 'Positive' Class : 0
##
# Fit a classification tree on the training set and report the fitting time.
system.time(tree_model <- tree(target ~ ., train))
## user system elapsed
## 0.03 0.00 0.03
# Predict class labels for the held-out rows.
predictionsDtree <- predict(tree_model, test, type = "class")
# Rows: predicted class, columns: true class.
table(predictionsDtree, test$target)
##
## predictionsDtree 0 1
## 0 34 12
## 1 9 35
# Share of test rows whose predicted class equals the true class.
acuracy <- 1 - mean(predictionsDtree != test$target)
acuracy
## [1] 0.7666667
summary(tree_model)
##
## Classification tree:
## tree(formula = target ~ ., data = train)
## Variables actually used in tree construction:
## [1] "number_major_vessels" "thal"
## [3] "maximum_heart_rate" "exercise_induced_angina"
## [5] "depression_induced_exercise_rest" "serum_cholestoral"
## [7] "age" "chest_pain_type"
## Number of terminal nodes: 18
## Residual mean deviance: 0.3997 = 77.94 / 195
## Misclassification error rate: 0.1033 = 22 / 213
# plot() draws only the branches; text() adds the split rules and leaf
# labels so the tree can actually be read.
plot(tree_model)
text(tree_model, pretty = 0)
Metric that evaluates the level of agreement of a classification task
# Confusion table for the decision tree (rows: predicted, columns: true).
cm <- table(predictionsDtree, test$target)
cm
##
## predictionsDtree 0 1
## 0 34 12
## 1 9 35
# Pick the kappa statistic out of caret's overall metrics by name.
kappa <- confusionMatrix(cm)[["overall"]]["Kappa"]
kappa
## Kappa
## 0.5337938
Confusion MatriX
# Full caret report for the confusion table `cm`: accuracy, kappa,
# sensitivity, specificity and related statistics.
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predictionsDtree 0 1
## 0 34 12
## 1 9 35
##
## Accuracy : 0.7667
## 95% CI : (0.6657, 0.8494)
## No Information Rate : 0.5222
## P-Value [Acc > NIR] : 1.559e-06
##
## Kappa : 0.5338
##
## Mcnemar's Test P-Value : 0.6625
##
## Sensitivity : 0.7907
## Specificity : 0.7447
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.7955
## Prevalence : 0.4778
## Detection Rate : 0.3778
## Detection Prevalence : 0.5111
## Balanced Accuracy : 0.7677
##
## 'Positive' Class : 0
##
# Fit a random forest on the training set and report the fitting time.
# importance = TRUE stores variable-importance measures; do.trace = 100
# prints the running error rates every 100 trees.
system.time(
  forest_model <- randomForest(
    target ~ .,
    data = train,
    importance = TRUE,
    do.trace = 100
  )
)
## ntree OOB 1 2
## 100: 16.43% 24.21% 10.17%
## 200: 15.96% 24.21% 9.32%
## 300: 15.96% 22.11% 11.02%
## 400: 15.02% 22.11% 9.32%
## 500: 16.43% 23.16% 11.02%
## user system elapsed
## 0.60 0.02 0.62
# Predict the held-out rows.
predictionsForest <- predict(forest_model, test)
# Rows: predicted class, columns: true class.
table(predictionsForest, test$target)
##
## predictionsForest 0 1
## 0 33 8
## 1 10 39
# Share of test rows whose predicted class equals the true class.
acuracy <- 1 - mean(predictionsForest != test$target)
acuracy
## [1] 0.8
# Error rate against number of trees: one curve per column of err.rate
# (overall OOB error plus one per class).
plot(forest_model)
# The curves are drawn with matplot()'s default colours and line types
# (1, 2, 3, ...), so the legend uses the same indices instead of hard-coded
# colour names and all-solid lines, and takes its labels from the model.
error_series <- colnames(forest_model$err.rate)
legend("topright", legend = error_series,
       col = seq_along(error_series), lty = seq_along(error_series),
       cex = 0.8)
Two measures of importance to rank attributes.
# Plot the variable-importance measures stored in the forest (it was fitted
# with importance = TRUE).
varImpPlot(forest_model)
Metric that evaluates the level of agreement of a classification task
# Confusion table for the random forest (rows: predicted, columns: true).
cm <- table(predictionsForest, test$target)
cm
##
## predictionsForest 0 1
## 0 33 8
## 1 10 39
# Pick the kappa statistic out of caret's overall metrics by name.
kappa <- confusionMatrix(cm)[["overall"]]["Kappa"]
kappa
## Kappa
## 0.5984135
Confusion MatriX
# Full caret report for the confusion table `cm`: accuracy, kappa,
# sensitivity, specificity and related statistics.
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predictionsForest 0 1
## 0 33 8
## 1 10 39
##
## Accuracy : 0.8
## 95% CI : (0.7025, 0.8769)
## No Information Rate : 0.5222
## P-Value [Acc > NIR] : 4.197e-08
##
## Kappa : 0.5984
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.7674
## Specificity : 0.8298
## Pos Pred Value : 0.8049
## Neg Pred Value : 0.7959
## Prevalence : 0.4778
## Detection Rate : 0.3667
## Detection Prevalence : 0.4556
## Balanced Accuracy : 0.7986
##
## 'Positive' Class : 0
##
The principal components are equal to the right singular vectors if you first scale (subtract the mean, divide by the standard deviation) the variables. We do not use the “target” column when PCA is applied.
# Drop column 14 (the outcome), standardise the remaining predictors and run
# a principal component analysis on them.
train_pca <- scale(train[-14])
pca <- princomp(train_pca)
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.6640231 1.2209697 1.1144255 1.09137052 1.03006301
## Proportion of Variance 0.2140026 0.1152153 0.0959848 0.09205446 0.08200267
## Cumulative Proportion 0.2140026 0.3292179 0.4252027 0.51725718 0.59925984
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.96735647 0.92389158 0.85801991 0.84872924
## Proportion of Variance 0.07232251 0.06596938 0.05689775 0.05567224
## Cumulative Proportion 0.67158235 0.73755173 0.79444949 0.85012173
## Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.80293715 0.69923889 0.65288376 0.61592998
## Proportion of Variance 0.04982686 0.03778779 0.03294368 0.02931994
## Cumulative Proportion 0.89994859 0.93773638 0.97068006 1.00000000
PCA results (standard deviation, proportion of variance and cumulative proportion of variance).
# Draw the default plot for the fitted PCA object.
# NOTE(review): for princomp results this is presumably a scree plot of the
# component variances — confirm.
plot(pca)
We are going to keep only the leading principal components, chosen by their cumulative share of the total variance (all 13 components together account for 100%).
# Share of the total variance carried by each component, then the running
# total across components.
vars <- prop.table(pca$sdev^2)
cumulativeVariance <- cumsum(vars)
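Target cumulative variance: about 0.95. We keep only the first 11 components, which retain roughly 93.8% of the variance (12 components would be needed to exceed 0.95).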
# Inspect the cumulative variance shares interactively.
View(as.data.frame(cumulativeVariance))
# Training frame for the PCA-based model: scores on the first 11 components
# plus the outcome column.
train_pca <- as.data.frame(pca$scores[, seq_len(11)])
train_pca$target <- train$target
# Fit an SVM on the PCA scores and report the fitting time.
# probability = TRUE lets predict() return class probabilities later on.
system.time(svm_model_pca <- svm(target ~ ., train_pca, probability = TRUE))
## user system elapsed
## 0.11 0.00 0.11
# NOTE(review): these predictions are made on train_pca, the same rows the
# model was fitted on, so the figures below are training (resubstitution)
# results and are not comparable with the test-set accuracies of the other
# models. A fair comparison needs the test rows projected onto the same
# components.
predictionsSVM_PCA <- predict(svm_model_pca, train_pca, probability = TRUE)
# Rows: predicted class, columns: true class.
table(predictionsSVM_PCA, train_pca$target)
##
## predictionsSVM_PCA 0 1
## 0 86 6
## 1 9 112
# Share of rows whose predicted class equals the true class.
acuracy <- 1 - mean(predictionsSVM_PCA != train_pca$target)
acuracy
## [1] 0.9295775
summary(svm_model_pca)
##
## Call:
## svm(formula = target ~ ., data = train_pca, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.09090909
##
## Number of Support Vectors: 138
##
## ( 68 70 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Put true class, predicted class and class probabilities side by side.
# predictionsSVM_PCA was computed on train_pca, so the true labels must come
# from train_pca too; pairing it with test$target recycled a shorter vector
# against unrelated rows (hence the former cbind() length warning).
probabilities <- attr(predictionsSVM_PCA, "probabilities")
predictionsAndProbabilities <- cbind(train_pca$target, predictionsSVM_PCA, probabilities)
View(predictionsAndProbabilities)
Metric that evaluates the level of agreement of a classification task
# Confusion table for the PCA-based SVM (rows: predicted, columns: true).
cm <- table(predictionsSVM_PCA, train_pca$target)
cm
##
## predictionsSVM_PCA 0 1
## 0 86 6
## 1 9 112
# Pick the kappa statistic out of caret's overall metrics by name.
kappa <- confusionMatrix(cm)[["overall"]]["Kappa"]
kappa
## Kappa
## 0.8570534
Confusion MatriX
# Full caret report for the confusion table `cm`: accuracy, kappa,
# sensitivity, specificity and related statistics.
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predictionsSVM_PCA 0 1
## 0 86 6
## 1 9 112
##
## Accuracy : 0.9296
## 95% CI : (0.8865, 0.9601)
## No Information Rate : 0.554
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8571
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.9053
## Specificity : 0.9492
## Pos Pred Value : 0.9348
## Neg Pred Value : 0.9256
## Prevalence : 0.4460
## Detection Rate : 0.4038
## Detection Prevalence : 0.4319
## Balanced Accuracy : 0.9272
##
## 'Positive' Class : 0
##