library(psych)
library(randomForest) #for classification algorithm
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
library(tidyverse) #for data wrangling
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## %+%(): ggplot2, psych
## alpha(): ggplot2, psych
## combine(): dplyr, randomForest
## filter(): dplyr, stats
## lag(): dplyr, stats
## margin(): ggplot2, randomForest
# Take a working copy of the iris data set from the `datasets` package.
iris_data <- datasets::iris
# Print per-column summary statistics for the copy.
summary(object = iris_data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Check the data type of each column to better understand how many discrete and continuous variables the data frame contains. In R we use str() for this.
# Compactly display the structure (column names and types) of the data frame.
str(object = iris_data)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
From the output above, we see that Sepal.Length, Sepal.Width, Petal.Length, and Petal.Width are all numeric, while Species is a factor (categorical).
# Restrict the correlation analysis to the numeric measurements: Species is a
# nominal factor, so Pearson correlations against its integer codes are not
# meaningful. Selecting by type avoids hard-coded column positions.
iris_numeric <- iris_data[vapply(iris_data, is.numeric, logical(1))]
# Scatterplot matrix of the measurements; species membership stays visible as
# the point fill colour instead of being treated as an extra numeric variable.
pairs.panels(
  iris_numeric,
  gap = 0,
  bg = c("red", "yellow", "blue")[iris_data$Species],
  pch = 21
)
# Pairwise Pearson correlation matrix of the same numeric columns.
cor(iris_numeric)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
# Two-sided Pearson correlation test: sepal length vs. petal length.
cor.test(
  x = iris_data$Sepal.Length,
  y = iris_data$Petal.Length,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: iris_data$Sepal.Length and iris_data$Petal.Length
## t = 21.646, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8270363 0.9055080
## sample estimates:
## cor
## 0.8717538
# Two-sided Pearson correlation test: sepal length vs. sepal width.
cor.test(
  x = iris_data$Sepal.Length,
  y = iris_data$Sepal.Width,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: iris_data$Sepal.Length and iris_data$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.27269325 0.04351158
## sample estimates:
## cor
## -0.1175698
# Two-sided Pearson correlation test: petal length vs. petal width.
cor.test(
  x = iris_data$Petal.Length,
  y = iris_data$Petal.Width,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: iris_data$Petal.Length and iris_data$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9490525 0.9729853
## sample estimates:
## cor
## 0.9628654
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Correlation matrix together with the matrix of pairwise p-values, for
# comparison with the individual cor.test() results above. rcorr() needs a
# numeric matrix, so the four measurement columns are converted first.
rcorr(x = as.matrix(iris_data[, 1:4]), type = "pearson")
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.00 -0.12 0.87 0.82
## Sepal.Width -0.12 1.00 -0.43 -0.37
## Petal.Length 0.87 -0.43 1.00 0.96
## Petal.Width 0.82 -0.37 0.96 1.00
##
## n= 150
##
##
## P
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.1519 0.0000 0.0000
## Sepal.Width 0.1519 0.0000 0.0000
## Petal.Length 0.0000 0.0000 0.0000
## Petal.Width 0.0000 0.0000 0.0000
# Split the data into predictors (M) and the class label (L), selecting
# Species by name rather than by column position.
M <- iris_data[, names(iris_data) != "Species", drop = FALSE]
L <- iris_data[["Species"]]
# Train the model. Random forests draw bootstrap samples and random feature
# subsets, so fix the RNG seed first to make the OOB error estimate, the
# confusion matrix, and the importance scores reproducible between runs.
set.seed(71)
iris_rf <- randomForest(x = M, y = L)
# Print the fitted model (OOB error estimate and confusion matrix).
iris_rf
##
## Call:
## randomForest(x = M, y = L)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.67%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 4 46 0.08
# Show the variable-importance matrix stored on the fitted forest; `[[` is
# used for exact (non-partial) name matching.
iris_rf[["importance"]]
## MeanDecreaseGini
## Sepal.Length 10.064307
## Sepal.Width 2.724958
## Petal.Length 40.177059
## Petal.Width 46.253903