# install.packages('MASS')
library(MASS)
data(Boston)
# Dimenson of the data
dim(Boston)
## [1] 506 14
#Draw a matrix of scatterplots for the varaibles
pairs(Boston)
#Draw the correlation plots
library(corrplot)
## corrplot 0.95 loaded
correlationvalue = cor(Boston)
corrplot.mixed(correlationvalue, order = 'AOE')
par(mfrow=c(3,2))
plot(Boston$age, Boston$crim)
# Older homes, more crime
plot(Boston$nox, Boston$crim)
# middle value of nox, more crime
plot(Boston$dis, Boston$crim)
# Closer to work-area, more crime
plot(Boston$rad, Boston$crim)
# Higher index of accessibility to radial highways, more crime
plot(Boston$tax, Boston$crim)
# Higher tax rate, more crime
plot(Boston$ptratio, Boston$crim)
par(mfrow=c(1,3))
hist(Boston$crim[Boston$crim>1], breaks=20, freq = T, main="Crime More than 1")
# The crime rates are heavily right skewed, indicating that most cities have low crime rates,
# but there are a few citites having high crimial rate, which could reach to above 80.
hist(Boston$tax, breaks=20, freq = T, main="Tax")
# There is a large divide between suburbs with low tax rates and a peak at 600-700
hist(Boston$ptratio, breaks=20, freq = T, main="Ptratio")
library(e1071)
skewness(Boston$ptratio)
## [1] -0.7975743
nrow(subset(Boston, chas == 1))
## [1] 35
median(Boston$ptratio)
## [1] 19.05
to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.
library(mlbench)
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
## Loading required package: lattice
zero_cols = nearZeroVar( Soybean )
colnames( Soybean )[ zero_cols ]
## [1] "leaf.mild" "mycelium" "sclerotia"
Soybean1 = Soybean[,-zero_cols]
Soybean2 = Soybean[, zero_cols]
head(Soybean1)
## Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker 6 0 2 1 0 1 1
## 2 diaporthe-stem-canker 4 0 2 1 0 2 0
## 3 diaporthe-stem-canker 3 0 2 1 0 1 0
## 4 diaporthe-stem-canker 3 0 2 1 0 1 0
## 5 diaporthe-stem-canker 6 0 2 1 0 2 0
## 6 diaporthe-stem-canker 5 0 2 1 0 3 0
## sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1 1 0 0 1 1 0 2 2
## 2 2 1 1 1 1 0 2 2
## 3 2 1 2 1 1 0 2 2
## 4 2 0 1 1 1 0 2 2
## 5 1 0 2 1 1 0 2 2
## 6 1 0 1 1 1 0 2 2
## leaf.shread leaf.malf stem lodging stem.cankers canker.lesion fruiting.bodies
## 1 0 0 1 1 3 1 1
## 2 0 0 1 0 3 1 1
## 3 0 0 1 0 3 0 1
## 4 0 0 1 0 3 0 1
## 5 0 0 1 0 3 1 1
## 6 0 0 1 0 3 0 1
## ext.decay int.discolor fruit.pods fruit.spots seed mold.growth seed.discolor
## 1 1 0 0 4 0 0 0
## 2 1 0 0 4 0 0 0
## 3 1 0 0 4 0 0 0
## 4 1 0 0 4 0 0 0
## 5 1 0 0 4 0 0 0
## 6 1 0 0 4 0 0 0
## seed.size shriveling roots
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
#frequency distribution of each predictor
par(mfrow = c(3,3))
for(i in 2:ncol(Soybean)) {
plot(Soybean[i], main = colnames(Soybean[i]))}
#Near Zero Variance variables
nearZeroVar(Soybean)
## [1] 19 26 28
names(Soybean[nearZeroVar(Soybean)])
## [1] "leaf.mild" "mycelium" "sclerotia"
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(Soybean, col=c('navyblue','red'),
numbers=TRUE, sortVars=TRUE,
combined = TRUE,
prop = FALSE,
only.miss = TRUE,
labels=names(Soybean),
cex.axis=.7, gap=3,
ylab=c("Missing Data"))
##
## Variables sorted by number of missings:
## Variable Count
## hail 121
## sever 121
## seed.tmt 121
## lodging 121
## germ 112
## leaf.mild 108
## fruiting.bodies 106
## fruit.spots 106
## seed.discolor 106
## shriveling 106
## leaf.shread 100
## seed 92
## mold.growth 92
## seed.size 92
## leaf.halo 84
## leaf.marg 84
## leaf.size 84
## leaf.malf 84
## fruit.pods 84
## precip 38
## stem.cankers 38
## canker.lesion 38
## ext.decay 38
## mycelium 38
## int.discolor 38
## sclerotia 38
## plant.stand 36
## roots 31
## temp 30
## crop.hist 16
## plant.growth 16
## stem 16
## date 1
## area.dam 1
## Class 0
## leaves 0
A strategy for handling the missing data in the data set is to remove the predictors with excessive missingness and impute the remaining missing values. Several of the predictors have missing proportions around 15-18%, so these variables can be excluded. For the remaining categorical predictors, the missing values can be imputed using the most frequent category (mode). This approach retains most observations while reducing the impact of missing data.
We can use the imputation method availabe in the mice package as follows
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
set.seed(1)
missingSoybeans <- mice(Soybean, method = "pmm", printFlag = F, seed =112)
## Warning: Number of logged events: 1664
summary(missingSoybeans)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Class date plant.stand precip temp
## "" "pmm" "pmm" "pmm" "pmm"
## hail crop.hist area.dam sever seed.tmt
## "pmm" "pmm" "pmm" "pmm" "pmm"
## germ plant.growth leaves leaf.halo leaf.marg
## "pmm" "pmm" "" "pmm" "pmm"
## leaf.size leaf.shread leaf.malf leaf.mild stem
## "pmm" "pmm" "pmm" "pmm" "pmm"
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## "pmm" "pmm" "pmm" "pmm" "pmm"
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## "pmm" "pmm" "pmm" "pmm" "pmm"
## seed mold.growth seed.discolor seed.size shriveling
## "pmm" "pmm" "pmm" "pmm" "pmm"
## roots
## "pmm"
## PredictorMatrix:
## Class date plant.stand precip temp hail crop.hist area.dam sever
## Class 0 1 1 1 1 1 1 1 1
## date 1 0 1 1 1 1 1 1 1
## plant.stand 1 1 0 1 1 1 1 1 1
## precip 1 1 1 0 1 1 1 1 1
## temp 1 1 1 1 0 1 1 1 1
## hail 1 1 1 1 1 0 1 1 1
## seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## Class 1 1 1 1 1 1 1
## date 1 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1 1
## precip 1 1 1 1 1 1 1
## temp 1 1 1 1 1 1 1
## hail 1 1 1 1 1 1 1
## leaf.shread leaf.malf leaf.mild stem lodging stem.cankers
## Class 1 1 1 1 1 1
## date 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1
## precip 1 1 1 1 1 1
## temp 1 1 1 1 1 1
## hail 1 1 1 1 1 1
## canker.lesion fruiting.bodies ext.decay mycelium int.discolor
## Class 1 1 1 1 1
## date 1 1 1 1 1
## plant.stand 1 1 1 1 1
## precip 1 1 1 1 1
## temp 1 1 1 1 1
## hail 1 1 1 1 1
## sclerotia fruit.pods fruit.spots seed mold.growth seed.discolor
## Class 1 1 1 1 1 1
## date 1 1 1 1 1 1
## plant.stand 1 1 1 1 1 1
## precip 1 1 1 1 1 1
## temp 1 1 1 1 1 1
## hail 1 1 1 1 1 1
## seed.size shriveling roots
## Class 1 1 1
## date 1 1 1
## plant.stand 1 1 1
## precip 1 1 1
## temp 1 1 1
## hail 1 1 1
## Number of logged events: 1664
## it im dep meth
## 1 1 1 plant.stand pmm
## 2 1 1 plant.stand pmm
## 3 1 1 precip pmm
## 4 1 1 precip pmm
## 5 1 1 temp pmm
## 6 1 1 hail pmm
## out
## 1 Classbrown-spot, Classcyst-nematode, Classdiaporthe-pod-&-stem-blight, int.discolor1
## 2 mice detected that your data are (nearly) multi-collinear.\nIt applied a ridge penalty to continue calculations, but the results can be unstable.\nDoes your dataset contain duplicates, linear transformation, or factors with unique respondent names?
## 3 Classbrown-spot, Classcharcoal-rot, Classcyst-nematode, Classdiaporthe-stem-canker, Classherbicide-injury, fruit.pods1
## 4 mice detected that your data are (nearly) multi-collinear.\nIt applied a ridge penalty to continue calculations, but the results can be unstable.\nDoes your dataset contain duplicates, linear transformation, or factors with unique respondent names?
## 5 Classbrown-stem-rot, Classcyst-nematode, int.discolor1, sclerotia1
## 6 Classbrown-spot, Classcharcoal-rot, Classcyst-nematode, Classdiaporthe-pod-&-stem-blight, Classdiaporthe-stem-canker, Classherbicide-injury, leaf.mild2, lodging1, stem.cankers2, ext.decay2, fruit.pods2, fruit.spots1, seed.discolor1
#Finally, we can visualize after imputation to see if missing data remains
md.pattern(missingSoybeans$predictorMatrix)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
## 36 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0
## germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
## 36 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 36 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth
## 36 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## seed.discolor seed.size shriveling roots
## 36 1 1 1 1 0
## 0 0 0 0 0
More about Imputation: The process of estimating missing data points is called imputation. This can be done in many different ways, and it is always true that the best method depends on the problem.
We have seen that sometimes people delete any indicator or unit that has missing values, although this is not the right way, and in many cases, this can be too restrictive. Ideally, one can obtain reasonable results despite small amounts of missing data, but if too much data is missing, the uncertainty can be too high to give a meaningful analysis. As usual, there should be a balance in the way missing data is handled.
Although there are multiple methods of imputation, the indicator method is commonly used. In this method, we can use data from other indicators to estimate the missing point. The core idea here is that if there is a relation between indicators, it is highly probable to guess the missing data points by using known values of other indicators. This can take the form of: a) Simply substituting the mean or median of the normalized values of the other indicators. b) Substituting the mean or median of normalized values of the other indicators within the aggregation group. c) Using a more formal approach based on regression or, more generally, on statistical modeling.
library(caret)
data(BloodBrain)
str(logBBB)
## num [1:208] 1.08 -0.4 0.22 0.14 0.69 0.44 -0.43 1.38 0.75 0.88 ...
str(bbbDescr)
## 'data.frame': 208 obs. of 134 variables:
## $ tpsa : num 12 49.3 50.5 37.4 37.4 ...
## $ nbasic : int 1 0 1 0 1 1 1 1 1 1 ...
## $ negative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ vsa_hyd : num 167.1 92.6 295.2 319.1 299.7 ...
## $ a_aro : int 0 6 15 15 12 11 6 12 12 6 ...
## $ weight : num 156 151 366 383 326 ...
## $ peoe_vsa.0 : num 76.9 38.2 58.1 62.2 74.8 ...
## $ peoe_vsa.1 : num 43.4 25.5 124.7 124.7 118 ...
## $ peoe_vsa.2 : num 0 0 21.7 13.2 33 ...
## $ peoe_vsa.3 : num 0 8.62 8.62 21.79 0 ...
## $ peoe_vsa.4 : num 0 23.3 17.4 0 0 ...
## $ peoe_vsa.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ peoe_vsa.6 : num 17.24 0 8.62 8.62 8.62 ...
## $ peoe_vsa.0.1 : num 18.7 49 83.8 83.8 83.8 ...
## $ peoe_vsa.1.1 : num 43.5 0 49 68.8 36.8 ...
## $ peoe_vsa.2.1 : num 0 0 0 0 0 ...
## $ peoe_vsa.3.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ peoe_vsa.4.1 : num 0 0 5.68 5.68 5.68 ...
## $ peoe_vsa.5.1 : num 0 13.567 2.504 0 0.137 ...
## $ peoe_vsa.6.1 : num 0 7.9 2.64 2.64 2.5 ...
## $ a_acc : int 0 2 2 2 2 2 2 2 0 2 ...
## $ a_acid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ a_base : int 1 0 1 1 1 1 1 1 1 1 ...
## $ vsa_acc : num 0 13.57 8.19 8.19 8.19 ...
## $ vsa_acid : num 0 0 0 0 0 0 0 0 0 0 ...
## $ vsa_base : num 5.68 0 0 0 0 ...
## $ vsa_don : num 5.68 5.68 5.68 5.68 5.68 ...
## $ vsa_other : num 0 28.1 43.6 28.3 19.6 ...
## $ vsa_pol : num 0 13.6 0 0 0 ...
## $ slogp_vsa0 : num 18 25.4 14.1 14.1 14.1 ...
## $ slogp_vsa1 : num 0 23.3 34.8 34.8 34.8 ...
## $ slogp_vsa2 : num 3.98 23.86 0 0 0 ...
## $ slogp_vsa3 : num 0 0 76.2 76.2 76.2 ...
## $ slogp_vsa4 : num 4.41 0 3.19 3.19 3.19 ...
## $ slogp_vsa5 : num 32.9 0 9.51 0 0 ...
## $ slogp_vsa6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ slogp_vsa7 : num 0 70.6 148.1 144 140.7 ...
## $ slogp_vsa8 : num 113.2 0 75.5 75.5 75.5 ...
## $ slogp_vsa9 : num 33.3 41.3 28.3 55.5 26 ...
## $ smr_vsa0 : num 0 23.86 12.63 3.12 3.12 ...
## $ smr_vsa1 : num 18 25.4 27.8 27.8 27.8 ...
## $ smr_vsa2 : num 4.41 0 0 0 0 ...
## $ smr_vsa3 : num 3.98 5.24 8.43 8.43 8.43 ...
## $ smr_vsa4 : num 0 20.8 29.6 21.4 20.3 ...
## $ smr_vsa5 : num 113.2 70.6 235.1 235.1 234.6 ...
## $ smr_vsa6 : num 0 5.26 76.25 76.25 76.25 ...
## $ smr_vsa7 : num 66.2 33.3 0 31.3 0 ...
## $ tpsa.1 : num 16.6 49.3 51.7 38.6 38.6 ...
## $ logp.o.w. : num 2.948 0.889 4.439 5.254 3.8 ...
## $ frac.anion7. : num 0 0.001 0 0 0 0 0.001 0 0 0 ...
## $ frac.cation7. : num 0.999 0 0.986 0.986 0.986 0.986 0.996 0.946 0.999 0.976 ...
## $ andrewbind : num 3.4 -3.3 12.8 12.8 10.3 10 10.4 15.9 12.9 9.5 ...
## $ rotatablebonds : int 3 2 8 8 8 8 8 7 4 5 ...
## $ mlogp : num 2.5 1.06 4.66 3.82 3.27 ...
## $ clogp : num 2.97 0.494 5.137 5.878 4.367 ...
## $ mw : num 155 151 365 382 325 ...
## $ nocount : int 1 3 5 4 4 4 4 3 2 4 ...
## $ hbdnr : int 1 2 1 1 1 1 2 1 1 0 ...
## $ rule.of.5violations : int 0 0 1 1 0 0 0 0 1 0 ...
## $ alert : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prx : int 0 1 6 2 2 2 1 0 0 4 ...
## $ ub : num 0 3 5.3 5.3 4.2 3.6 3 4.7 4.2 3 ...
## $ pol : int 0 2 3 3 2 2 2 3 4 1 ...
## $ inthb : int 0 0 0 0 0 0 1 0 0 0 ...
## $ adistm : num 0 395 1365 703 746 ...
## $ adistd : num 0 10.9 25.7 10 10.6 ...
## $ polar_area : num 21.1 117.4 82.1 65.1 66.2 ...
## $ nonpolar_area : num 379 248 638 668 602 ...
## $ psa_npsa : num 0.0557 0.4743 0.1287 0.0974 0.11 ...
## $ tcsa : num 0.0097 0.0134 0.0111 0.0108 0.0118 0.0111 0.0123 0.0099 0.0106 0.0115 ...
## $ tcpa : num 0.1842 0.0417 0.0972 0.1218 0.1186 ...
## $ tcnp : num 0.0103 0.0198 0.0125 0.0119 0.013 0.0125 0.0162 0.011 0.0109 0.0122 ...
## $ ovality : num 1.1 1.12 1.3 1.3 1.27 ...
## $ surface_area : num 400 365 720 733 668 ...
## $ volume : num 656 555 1224 1257 1133 ...
## $ most_negative_charge: num -0.617 -0.84 -0.801 -0.761 -0.857 ...
## $ most_positive_charge: num 0.307 0.497 0.541 0.48 0.455 ...
## $ sum_absolute_charge : num 3.89 4.89 7.98 7.93 7.85 ...
## $ dipole_moment : num 1.19 4.21 3.52 3.15 3.27 ...
## $ homo : num -9.67 -8.96 -8.63 -8.56 -8.67 ...
## $ lumo : num 3.4038 0.1942 0.0589 -0.2651 0.3149 ...
## $ hardness : num 6.54 4.58 4.34 4.15 4.49 ...
## $ ppsa1 : num 349 223 518 508 509 ...
## $ ppsa2 : num 679 546 2066 2013 1999 ...
## $ ppsa3 : num 31 42.3 64 61.7 61.6 ...
## $ pnsa1 : num 51.1 141.8 202 225.4 158.8 ...
## $ pnsa2 : num -99.3 -346.9 -805.9 -894 -623.3 ...
## $ pnsa3 : num -10.5 -44 -43.8 -42 -39.8 ...
## $ fpsa1 : num 0.872 0.611 0.719 0.692 0.762 ...
## $ fpsa2 : num 1.7 1.5 2.87 2.75 2.99 ...
## $ fpsa3 : num 0.0774 0.1159 0.0888 0.0842 0.0922 ...
## $ fnsa1 : num 0.128 0.389 0.281 0.307 0.238 ...
## $ fnsa2 : num -0.248 -0.951 -1.12 -1.22 -0.933 ...
## $ fnsa3 : num -0.0262 -0.1207 -0.0608 -0.0573 -0.0596 ...
## $ wpsa1 : num 139.7 81.4 372.7 372.1 340.1 ...
## $ wpsa2 : num 272 199 1487 1476 1335 ...
## $ wpsa3 : num 12.4 15.4 46 45.2 41.1 ...
## $ wnsa1 : num 20.4 51.8 145.4 165.3 106 ...
## $ wnsa2 : num -39.8 -126.6 -580.1 -655.3 -416.3 ...
## [list output truncated]
?BloodBrain
## starting httpd help server ... done
nzv <- nearZeroVar(bbbDescr)
length(nzv)
## [1] 7
nzv
## [1] 3 16 17 22 25 50 60
library(corrplot)
cor_matrix <- cor(bbbDescr, use = "complete.obs")
hi_corr_vars <- findCorrelation(cor_matrix, cutoff = 0.8)
colnames(bbbDescr)[hi_corr_vars]
## [1] "vsa_don" "slogp_vsa7" "smr_vsa5"
## [4] "mw" "nocount" "hbdnr"
## [7] "ub" "nonpolar_area" "tcnp"
## [10] "surface_area" "volume" "ppsa2"
## [13] "ppsa3" "pnsa2" "fnsa3"
## [16] "wpsa1" "wpsa2" "wpsa3"
## [19] "wnsa1" "wnsa2" "wnsa3"
## [22] "dpsa1" "dpsa2" "dpsa3"
## [25] "sadh1" "sadh3" "chdh1"
## [28] "chdh3" "scdh1" "scdh2"
## [31] "scdh3" "saaa1" "saaa3"
## [34] "scaa1" "scaa2" "scaa3"
## [37] "ctdh" "ctaa" "mchg"
## [40] "vsa_hyd" "a_acid" "slogp_vsa3"
## [43] "tpsa" "weight" "logp.o.w."
## [46] "tpsa.1" "adistm" "polar_area"
## [49] "psa_npsa" "sum_absolute_charge" "fpsa1"
## [52] "fpsa2" "ppsa1" "chaa1"
## [55] "pnsa3" "chdh2" "vsa_acc"
a sample. In their procedure, they used a gas chromatograph (an instrument that separate chemicals in a sample) to measure seven different fatty acids in an oil. These measurements would then be used to predict the type of oil in a food samples. To create their model, they used 96 samples2 of seven types of oils. These data can be found in the caret package using data(oil). The oil types are contained in a factor variable called oilType. The types are pumpkin (coded as A), sunflower (B), peanut (C), olive (D), soybean (E), rapeseed (F) and corn (G). In R,
library(caret)
data(oil)
str(oilType)
## Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
table(oilType)
## oilType
## A B C D E F G
## 37 26 3 7 11 10 2
# Population frequency of oil types:
print( table(oilType)/length(oilType) )
## oilType
## A B C D E F G
## 0.38541667 0.27083333 0.03125000 0.07291667 0.11458333 0.10416667 0.02083333
# Using the sample function in R to draw a completely random sample of 60 oils for 30 times:
#
set.seed(12345)
oilsamples <- vector(mode="list",length = 30)
for ( i in seq(along = oilsamples)){
oilsamples[[i]] <- table(sample(oilType, size=60))
}
head (oilsamples,5)
## [[1]]
##
## A B C D E F G
## 24 20 1 4 5 4 2
##
## [[2]]
##
## A B C D E F G
## 23 17 1 4 7 8 0
##
## [[3]]
##
## A B C D E F G
## 27 14 2 4 6 6 1
##
## [[4]]
##
## A B C D E F G
## 25 17 0 4 6 7 1
##
## [[5]]
##
## A B C D E F G
## 21 20 2 4 6 6 1
oilsamples <- do.call("rbind",oilsamples)
head (oilsamples,5)
## A B C D E F G
## [1,] 24 20 1 4 5 4 2
## [2,] 23 17 1 4 7 8 0
## [3,] 27 14 2 4 6 6 1
## [4,] 25 17 0 4 6 7 1
## [5,] 21 20 2 4 6 6 1
# How do its frequencies compare with that of the population:
summary(oilsamples/60)
## A B C D
## Min. :0.3167 Min. :0.2167 Min. :0.00000 Min. :0.03333
## 1st Qu.:0.3500 1st Qu.:0.2708 1st Qu.:0.03333 1st Qu.:0.06667
## Median :0.3833 Median :0.2833 Median :0.03333 Median :0.06667
## Mean :0.3783 Mean :0.2844 Mean :0.03111 Mean :0.07500
## 3rd Qu.:0.3958 3rd Qu.:0.3125 3rd Qu.:0.03333 3rd Qu.:0.08333
## Max. :0.4500 Max. :0.3333 Max. :0.05000 Max. :0.11667
## E F G
## Min. :0.06667 Min. :0.03333 Min. :0.00000
## 1st Qu.:0.10000 1st Qu.:0.08333 1st Qu.:0.01667
## Median :0.10833 Median :0.10000 Median :0.01667
## Mean :0.11556 Mean :0.09833 Mean :0.01722
## 3rd Qu.:0.13333 3rd Qu.:0.11667 3rd Qu.:0.02917
## Max. :0.16667 Max. :0.13333 Max. :0.03333
set.seed(12345)
oilsamples2 <- createDataPartition(oilType, p = 0.60, times=20)
oilsamples2 <- lapply(oilsamples2, function(x, y) table(y[x]), y = oilType)
head (oilsamples2,5)
## $Resample01
##
## A B C D E F G
## 23 16 2 5 7 6 2
##
## $Resample02
##
## A B C D E F G
## 23 16 2 5 7 6 2
##
## $Resample03
##
## A B C D E F G
## 23 16 2 5 7 6 2
##
## $Resample04
##
## A B C D E F G
## 23 16 2 5 7 6 2
##
## $Resample05
##
## A B C D E F G
## 23 16 2 5 7 6 2
oilsamples2 <- do.call("rbind",oilsamples2)
head (oilsamples2,5)
## A B C D E F G
## Resample01 23 16 2 5 7 6 2
## Resample02 23 16 2 5 7 6 2
## Resample03 23 16 2 5 7 6 2
## Resample04 23 16 2 5 7 6 2
## Resample05 23 16 2 5 7 6 2
# How do its frequencies compare with that of the population:
summary(oilsamples2/60)
## A B C D
## Min. :0.3833 Min. :0.2667 Min. :0.03333 Min. :0.08333
## 1st Qu.:0.3833 1st Qu.:0.2667 1st Qu.:0.03333 1st Qu.:0.08333
## Median :0.3833 Median :0.2667 Median :0.03333 Median :0.08333
## Mean :0.3833 Mean :0.2667 Mean :0.03333 Mean :0.08333
## 3rd Qu.:0.3833 3rd Qu.:0.2667 3rd Qu.:0.03333 3rd Qu.:0.08333
## Max. :0.3833 Max. :0.2667 Max. :0.03333 Max. :0.08333
## E F G
## Min. :0.1167 Min. :0.1 Min. :0.03333
## 1st Qu.:0.1167 1st Qu.:0.1 1st Qu.:0.03333
## Median :0.1167 Median :0.1 Median :0.03333
## Mean :0.1167 Mean :0.1 Mean :0.03333
## 3rd Qu.:0.1167 3rd Qu.:0.1 3rd Qu.:0.03333
## Max. :0.1167 Max. :0.1 Max. :0.03333
binom.test(16, 20)
##
## Exact binomial test
##
## data: 16 and 20
## number of successes = 16, number of trials = 20, p-value = 0.01182
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.563386 0.942666
## sample estimates:
## probability of success
## 0.8
nbr_success = 16:20
ci_width = c()
for( nbrs in nbr_success ){
bt_out = binom.test( nbrs, 20 )
ci_width = c( ci_width, diff( bt_out$conf.int ) )
}
plot( nbr_success, ci_width, type='l', xlab='Number of correct samples (from 20)', ylab='95% confidence interval width')
- As we increase the sample size, the diagram illustrates that the width
of the confidence interval decreases, primarily due to the reduction in
the standard error. Consequently, increasing the sample size during
model building tends to improve model accuracy by reducing variability.
(Keep in mind that your answers may vary)
In addition, we could use a sketch of typical (squared) bias, variance, training error, test error on a single plot, as we go from less flexible statistical/predictive learning methods towards more flexible procedures. In the following plot, the x-axis represents the amount of flexibility in the method, and the y-axis represents the values for each curve. We observe that (i): (squared) bias - decreases monotonically since increases in flexibility result in a closer fit. (ii) variance - increases monotonically since increases in flexibility result in overfit. (iii) training error - decreases monotonically since increases in flexibility result in a closer fit. (iv) test error - concave up curve since increases in flexibility result in a closer fit before it overfits.
knitr::include_graphics("Homework1_Key_2024.jpeg")