1 Assignment: Section 7: Comprehensive application of data preprocessing

  在数据挖掘过程中,高维数据是非常棘手的研究对象。特别是在文本挖掘、图像处理和基因数据分析中,维度过高使很多学习器无法工作或效率降低,所以降维也是数据预处理过程的一项必要任务。降维大致有两大类别,一类是从原始维度中提取新的维度,例如主成分分析或因子分析,再或者是奇异值分解或是多维标度分析。另一类是从原始维度中选择一些子集,即称为特征选择(Feature Selection),或者叫作最佳子集选择。特征选择本质上继承了Occam’s razor的思想,从一组特征中选出一些最有效的特征,使构造出来的模型更好。

  进行特征选择的好处在于:

  主要方法有三种:

1.1 Load the data and library

rm(list =ls())
library(AppliedPredictiveModeling)
library(caret, warn.conflicts = F)
## Loading required package: lattice
## Loading required package: ggplot2
library(MASS, warn.conflicts = F)
library(e1071, warn.conflicts = F)
library(randomForest, warn.conflicts = F)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
data("segmentationOriginal")
# head(segmentationOriginal)

1.2 Delete junk column

length(segmentationOriginal[,1])
## [1] 2019
seg = subset(segmentationOriginal, Case == "Train")
segtest = subset(segmentationOriginal, Case == "Test")
length(seg[,1])
## [1] 1009
length(segtest[,1])
## [1] 1010
dim(seg)
## [1] 1009  119
yclass = seg$Class
xseg = seg[, -(1:3)]

1.3 Unsupervised filter

Delete variables with low variance (near to zero).

nearZeroVar(xseg)
## [1] 68 73 74
xseg = xseg[, -nearZeroVar(xseg)]

Delete variables with high correlation.

statuscolumn = grep("Status", names(xseg))   #Screening dichotomous qualitative variables
xseg1 = xseg[, -statuscolumn]
xsegstatus = xseg[,statuscolumn]
correlations = cor(xseg1)
dim(correlations)
## [1] 58 58
highcor = findCorrelation(correlations, 0.75)
length(highcor)
## [1] 32
xsegnum = xseg1[, -highcor]
xsegdata = cbind(xsegnum, xsegstatus)
colnames(xsegdata)
##  [1] "AngleCh1"                      "ConvexHullPerimRatioCh1"      
##  [3] "EntropyIntenCh1"               "FiberAlign2Ch3"               
##  [5] "FiberAlign2Ch4"                "FiberWidthCh1"                
##  [7] "IntenCoocASMCh3"               "IntenCoocASMCh4"              
##  [9] "IntenCoocContrastCh3"          "IntenCoocContrastCh4"         
## [11] "KurtIntenCh1"                  "KurtIntenCh3"                 
## [13] "KurtIntenCh4"                  "NeighborAvgDistCh1"           
## [15] "NeighborMinDistCh1"            "ShapeBFRCh1"                  
## [17] "ShapeLWRCh1"                   "SpotFiberCountCh3"            
## [19] "SpotFiberCountCh4"             "TotalIntenCh2"                
## [21] "VarIntenCh1"                   "VarIntenCh3"                  
## [23] "VarIntenCh4"                   "WidthCh1"                     
## [25] "XCentroid"                     "YCentroid"                    
## [27] "AngleStatusCh1"                "AreaStatusCh1"                
## [29] "AvgIntenStatusCh1"             "AvgIntenStatusCh2"            
## [31] "AvgIntenStatusCh3"             "AvgIntenStatusCh4"            
## [33] "ConvexHullAreaRatioStatusCh1"  "ConvexHullPerimRatioStatusCh1"
## [35] "DiffIntenDensityStatusCh1"     "DiffIntenDensityStatusCh3"    
## [37] "DiffIntenDensityStatusCh4"     "EntropyIntenStatusCh1"        
## [39] "EntropyIntenStatusCh3"         "EntropyIntenStatusCh4"        
## [41] "EqCircDiamStatusCh1"           "EqEllipseLWRStatusCh1"        
## [43] "EqEllipseOblateVolStatusCh1"   "EqEllipseProlateVolStatusCh1" 
## [45] "EqSphereAreaStatusCh1"         "EqSphereVolStatusCh1"         
## [47] "FiberAlign2StatusCh3"          "FiberAlign2StatusCh4"         
## [49] "FiberLengthStatusCh1"          "FiberWidthStatusCh1"          
## [51] "IntenCoocASMStatusCh3"         "IntenCoocASMStatusCh4"        
## [53] "IntenCoocContrastStatusCh3"    "IntenCoocContrastStatusCh4"   
## [55] "IntenCoocEntropyStatusCh3"     "IntenCoocEntropyStatusCh4"    
## [57] "IntenCoocMaxStatusCh3"         "IntenCoocMaxStatusCh4"        
## [59] "KurtIntenStatusCh3"            "KurtIntenStatusCh4"           
## [61] "LengthStatusCh1"               "NeighborAvgDistStatusCh1"     
## [63] "NeighborMinDistStatusCh1"      "NeighborVarDistStatusCh1"     
## [65] "PerimStatusCh1"                "ShapeBFRStatusCh1"            
## [67] "ShapeLWRStatusCh1"             "ShapeP2AStatusCh1"            
## [69] "SkewIntenStatusCh1"            "SkewIntenStatusCh3"           
## [71] "SkewIntenStatusCh4"            "SpotFiberCountStatusCh3"      
## [73] "SpotFiberCountStatusCh4"       "TotalIntenStatusCh1"          
## [75] "TotalIntenStatusCh2"           "TotalIntenStatusCh3"          
## [77] "TotalIntenStatusCh4"           "VarIntenStatusCh1"            
## [79] "VarIntenStatusCh3"             "VarIntenStatusCh4"            
## [81] "WidthStatusCh1"

1.4 Supervised filter

Delete variables with low correlation with dependent variable.

Write a function to inspect the correlation between independent and dependent variable.

pScore = function(x, y){
  numX = length(unique(x))
  if(numX > 2){
    out = t.test(x~y)$p.value
  }else{
    out = fisher.test(factor(x),y)$p.value
  }
  out
}

Write a function to calculate the correlation coefficient.

cal = function(x){
  numX = length(unique(x))
  numX
}

Write a function to adjust and filtrate p value.

pCorrection = function(score, p0){
  score = p.adjust(score, "bonferroni")
  keepers = (score <= p0)
  keepers
}

Filter data with functions.

scores = apply(X = xsegdata, MARGIN = 2, FUN = pScore, y = yclass)
tail(scores)
## TotalIntenStatusCh3 TotalIntenStatusCh4   VarIntenStatusCh1   VarIntenStatusCh3 
##        6.957262e-01        1.114685e-05        3.173506e-21        9.998495e-02 
##   VarIntenStatusCh4      WidthStatusCh1 
##        3.174527e-08        4.972084e-06
result1 = pCorrection(scores, 0.05)
colnames(xsegdata)[result1]
##  [1] "ConvexHullPerimRatioCh1"    "EntropyIntenCh1"           
##  [3] "FiberAlign2Ch3"             "FiberWidthCh1"             
##  [5] "IntenCoocASMCh3"            "IntenCoocASMCh4"           
##  [7] "IntenCoocContrastCh3"       "KurtIntenCh4"              
##  [9] "ShapeBFRCh1"                "ShapeLWRCh1"               
## [11] "TotalIntenCh2"              "VarIntenCh1"               
## [13] "VarIntenCh4"                "AvgIntenStatusCh1"         
## [15] "AvgIntenStatusCh2"          "DiffIntenDensityStatusCh1" 
## [17] "EntropyIntenStatusCh3"      "IntenCoocASMStatusCh3"     
## [19] "IntenCoocContrastStatusCh3" "IntenCoocEntropyStatusCh3" 
## [21] "IntenCoocMaxStatusCh3"      "SkewIntenStatusCh1"        
## [23] "TotalIntenStatusCh1"        "TotalIntenStatusCh2"       
## [25] "TotalIntenStatusCh4"        "VarIntenStatusCh1"         
## [27] "VarIntenStatusCh4"          "WidthStatusCh1"
xsegfilted = xsegdata[,result1]
dim(xsegfilted)
## [1] 1009   28
xyfilted = cbind(xsegfilted, yclass)

1.5 Wrapper

GLM ==> Stepwise regression

initial = glm(yclass~., data = xyfilted, family = binomial)
resultstep = stepAIC(initial, direction = "both")
resultstep$call

  封装法采用逐步回归的方法,通过每一步比较AIC的值,筛选变量,直到AIC不再减小。最后筛选出的14个变量为:(AIC为853.82

  • ConvexHullPerimRatioCh1
  • FiberWidthCh1
  • IntenCoocASMCh3
  • IntenCoocASMCh4
  • IntenCoocContrastCh3
  • TotalIntenCh2
  • VarIntenCh1
  • VarIntenCh4
  • AvgIntenStatusCh1
  • IntenCoocASMStatusCh3
  • IntenCoocMaxStatusCh3
  • SkewIntenStatusCh1
  • TotalIntenStatusCh2
  • VarIntenStatusCh1

图1:筛选过程

1.6 Method Compare

Comparison of different build-in methods

dim(seg)
## [1] 1009  119

Prepare the training set and test set

# total = segmentationOriginal[,-c(1:3,71,76,77)]
# Case = segmentationOriginal[,2]
# statuscolumn = grep("Status", names(total))
# total_1 = total[, -statuscolumn]
# total_2 = total[,statuscolumn]
# correlations_0 = cor(total_1)
# highcor_0 = findCorrelation(correlations_0, 0.75)
# total_3 = total_1[, -highcor_0]
# total_4 = cbind(total_3, total_2, Case)
# total_4 = cbind(total_1, total_2, Case)
# xtrain = subset(total_4, Case == "Train")
# xtrain = xtrain[,1:81]
# ytrain = seg[,3]
# train = cbind(xtrain, ytrain)
# xtest = subset(total_4, Case == "Test")
# xtest = xtest[,1:81]
# ytest = segtest[,3]
# test = cbind(xtest, ytest)
xtrain = seg[,-c(1:3,71,76,77)]
ytrain = seg[,3]
train = cbind(xtrain,ytrain)

xtest = segtest[,-c(1:3,71,76,77)]
ytest = segtest[,3]
test = cbind(xtest,ytest)

1.6.1 SBF-LDA

ldfCtrl = sbfControl(method = "repeatedcv", repeats = 5, functions = ldaSBF, verbose = F)
t1 = Sys.time()
ldaFilter = sbf(xtrain, ytrain, tol = 1.0e-12, sbfControl = ldfCtrl)
t2 = Sys.time()
t2 - t1
ldaFilter
##Time = 10.67644s

RESULT

# Time difference of 16.12887 secs
# 
# Selection By Filter
# 
# Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
# 
# Resampling performance:
# 
# 
# Using the training set, 71 variables were selected:
#    AvgIntenCh1, AvgIntenCh2, AvgIntenCh4, AvgIntenStatusCh1, AvgIntenStatusCh2...
# 
# During resampling, the top 5 selected variables (out of a possible 84):
#    AvgIntenCh1 (100%), AvgIntenCh2 (100%), AvgIntenCh4 (100%), AvgIntenStatusCh1 (100%), AvgIntenStatusCh2 (100%)
# 
# On average, 70.4 variables were selected (min = 66, max = 76)
Accuracy Kappa AccuracySD KappaSD
0.8051 0.5781 0.04489 0.09552

  SBF-LDA筛选出的前五个变量为:

  • AvgIntenCh1
  • AvgIntenCh2
  • AvgIntenCh4
  • AvgIntenStatusCh1
  • AvgIntenStatusCh2

1.6.2 SBF-RF

rffCtrl = sbfControl(method = "repeatedcv", repeats = 5, functions = rfSBF, verbose = F)
t1 = Sys.time()
rfFilter = sbf(xtrain, ytrain, sbfControl = rffCtrl)
t2 = Sys.time()
t2 - t1
rfFilter
# Time = 1.171011min

RESULT

# Time difference of 1.976484 mins
# 
# Selection By Filter
# 
# Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
# 
# Resampling performance:
# 
# 
# Using the training set, 71 variables were selected:
#    AvgIntenCh1, AvgIntenCh2, AvgIntenCh4, AvgIntenStatusCh1, AvgIntenStatusCh2...
# 
# During resampling, the top 5 selected variables (out of a possible 85):
#    AvgIntenCh1 (100%), AvgIntenCh2 (100%), AvgIntenCh4 (100%), AvgIntenStatusCh1 (100%), AvgIntenStatusCh2 (100%)
# 
# On average, 70.2 variables were selected (min = 67, max = 76)
Accuracy Kappa AccuracySD KappaSD
0.8303 0.6346 0.03511 0.07424

  SBF-RF筛选出的前五个变量为:

  • AvgIntenCh1
  • AvgIntenCh2
  • AvgIntenCh4
  • AvgIntenStatusCh1
  • AvgIntenStatusCh2

  两种方法得到的Top5重要变量一致!

1.6.3 RFE-LDA

ldrctrl = rfeControl(method = "repeatedcv", repeats = 5, verbose = F, functions = ldaFuncs)
set.seed(2)
t1 = Sys.time()
ldaRFE = rfe(xtrain, ytrain, metric = "ROC", rfeControl = ldrctrl)
t2 = Sys.time()
t2 - t1
ldaRFE
# Time = 5.775458s

RESULT

# Time difference of 9.768444 secs
# 
# Recursive feature selection
# 
# Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
# 
# Resampling performance over subset size:
# 
# 
# The top 5 variables (out of 16):
#    FiberWidthCh1, TotalIntenCh2, ShapeP2ACh1, TotalIntenCh1, AvgIntenCh2
Variables Accuracy Kappa AccuracySD KappaSD Selected
4 0.7656 0.4827 0.03458 0.07680
8 0.7889 0.5338 0.03406 0.07880
16 0.7985 0.5561 0.03471 0.07761 *
113 0.7959 0.5580 0.03323 0.07374

  RFE-LDA筛选出的前五个变量为:

  • FiberWidthCh1
  • TotalIntenCh2
  • ShapeP2ACh1
  • TotalIntenCh1
  • AvgIntenCh2

1.6.4 RFE-RF

rfrctrl = rfeControl(method = "repeatedcv", repeats = 5, verbose = F, functions = rfFuncs)
set.seed(100)
t1 = Sys.time()
rfRFE = rfe(xtrain, ytrain, metric = "ROC", rfeControl = rfrctrl)
t2 = Sys.time()
t2 - t1
rfRFE
#Time = 4.640548min

RESULT

# Time difference of 6.229533 mins
# 
# Recursive feature selection
# 
# Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
# 
# Resampling performance over subset size:
# 
# 
# The top 5 variables (out of 113):
#    TotalIntenCh2, FiberWidthCh1, TotalIntenCh1, ShapeP2ACh1, AvgIntenCh2
Variables Accuracy Kappa AccuracySD KappaSD Selected
4 0.8092 0.5906 0.03752 0.07831
8 0.8268 0.6254 0.04262 0.09266
16 0.8251 0.6239 0.04146 0.08973
113 0.8292 0.6317 0.03707 0.08070 *

  RFE-RF筛选出的前五个变量为:

  • TotalIntenCh2
  • FiberWidthCh1
  • TotalIntenCh1
  • AvgIntenCh2
  • ShapeP2ACh1

1.7 Results comparasion

Variables Accuracy Kappa AccuracySD KappaSD Selected
4 0.8090 0.5881 0.04124 0.08965
8 0.8236 0.6229 0.03591 0.07713
16 0.8217 0.6178 0.03867 0.08419
81 0.8244 0.6233 0.03865 0.08460 *

   RFE的LDA和RF方法筛选得到的Top5重要变量中也都保持一致,但排序不一致;同时,比较两种方法(SBF和RFE),两者得到的结果匹配度较低(仅一个变量:AvgIntenCh2 保持一致),这也体现了不同函数的评价体系或过程不同:

  1. 预测变量方面,SBF方法由于使用规则对变量进行过滤,得到的变量较多,RFE得到的变量较少
  2. 预测稳定性方面,四种方法交叉验证的准确率以及方差差距不大,SBF-RF准确率最高,SBF-LDA预测方差较小
  3. 训练时间方面,SBF训练时间远小于RFE,这与RFE过滤方法选择过程独立于训练过程有关,因此RFE有可能删除有用的特征

   同时,也发现:最后运行时间的排序和书中存在出入,直观来看,RF的运行时间的确应该比LDA长,因为RF是集成学习算法,需要训练多个“决策树模型”,从这个角度来看,实际运行结果是合理的性(可能书上结论存在印刷错误!),因此用LDA训练的时间会明显小于RF。

   问题:LDA方法存在多重共线性的现象:书本中的代码没有剔除掉高度相关的变量,我尝试将其剔除后,SBF-LDA能够正常运行,而RFE-LDA仍显示存在多重共线性(暂时未能解决)。

2 Section 1: Tidy data

This data set (“dirty_iris.csv”) is downloaded from Github.

2.1 Import library

rm(list=ls())
library(ggplot2)
library(stringr)
library(reshape2)
library(dplyr, warn.conflicts = F)

2.2 Import data

data = read.csv(file = 'dirty_iris.csv', header = TRUE, stringsAsFactors = FALSE)
tail(data)

2.3 Tidy the data

Sort data by “Species、Sepal.Length、Petal.Length” orderly and convert the type of attribute “Species” from string to factor.

tidy = arrange(data, Species, Sepal.Length, Petal.Length)
head(tidy)
data$Species = factor(data$Species)

3 Section 2: Data Management and cleaning

3.1 Import data

Merge the data from files that begin with “dirty”.

files = dir(pattern = '^dirty')
files
## [1] "dirty_iris.csv"  "dirty_iris2.csv"
data_List = lapply(files, read.csv, stringsAsFactor = FALSE)
data_mixed = do.call(rbind, data_List)
head(data_mixed)

Check the basic information of the dataset.

str(data_mixed)
## 'data.frame':    300 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

Use function “tbl_df()”(equal to tibble::as_tibble()) to make the output frame looks more friendly and easy to check.

data.df = tibble::as_tibble(data)
data.df

3.2 Basic data operation

Function “mutate()” ==> To add a new column

data.temp = mutate(data, rounded_Sepal.Length = round(Sepal.Length))
head(data.temp)
data.s = select(data.temp, rounded_Sepal.Length, Sepal.Length:Petal.Width,Species)
head(data.s)
data.s = select(data.s, -rounded_Sepal.Length)
head(data.s)

Function “filter” ==> To select row that satisfy the conditon.

MA_data = filter(data.s, Sepal.Length > 5 & Petal.Length < 4)
MA_data[1:5,]

Function “Arrange” ==> To rank.

arrange(data.s, Sepal.Length, Sepal.Width )[1:10,]

Function “summarise”&“group_by” ==> To statistical correlation statistics.

summarise(group_by(data, Species),
          avg.Sepal.L = mean(Sepal.Length, na.rm = TRUE),
          sd = sd(Sepal.Length, na.rm = TRUE),
          size = sum(! is.na(Sepal.Length)),
          se = sd/sqrt(size),
          conf.upper = avg.Sepal.L + qt(0.975, size-1)*se,
          conf.lower = avg.Sepal.L - qt(0.975, size-1)*se)
## `summarise()` ungrouping output (override with `.groups` argument)

4 Section3: Transformation of data

4.1 Min-Max Normalization

library(caret, warn.conflicts = F)
trans = preProcess(select(data, Petal.Length), method = c("range"))
# method = "range" ==> Min-Max Normalization  (0,1)
trans
## Created from 131 samples and 1 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (1)
transformed = predict(trans, select(data, Petal.Length))
head(transformed[,1])
## [1] 0.07142857 0.09523810 0.08571429 0.02539683 0.05555556         NA

4.2 Standardization

trans2 = preProcess(select(data, Petal.Length), method = c("center", "scale"))
# method = c("center", "scale" ) ==> Normalization  (-1,1)
transformed2 = predict(trans2, select(data, Petal.Length))
head(transformed2[,1])
## [1]  0.00867318  0.26867012  0.16467134 -0.49398756 -0.16465811          NA

5 Section 4: Missing value processing

5.1 Identify missing value

Check the first 10 value of Sepal.Length in the data to judge if there are some missing value.

head(is.na(data$Sepal.Length), 10)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

To count the missing value of attribute “Sepal.Length”.

sum(is.na(data$Sepal.Length))
## [1] 10

To count the non_missing value of attribute “Sepal.Length”.

sum(! is.na(data$Sepal.Length))
## [1] 140

If there exists missing value in any attribute in one record, then (“complete.cases()”) return TRUE.

complete.cases(data)[1:10]
##  [1]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
data.complete = data[complete.cases(data),]
length(data.complete)
## [1] 5

Show the missing value in “List”

library(mice, warn.conflicts = F)
md.pattern(data)

##    Species Sepal.Length Petal.Width Sepal.Width Petal.Length   
## 96       1            1           1           1            1  0
## 15       1            1           1           1            0  1
## 15       1            1           1           0            1  1
## 2        1            1           1           0            0  2
## 11       1            1           0           1            1  1
## 1        1            1           0           1            0  2
## 9        1            0           1           1            1  1
## 1        1            0           1           1            0  2
##          0           10          12          17           19 58

Show the missing value in “Figure”

library(VIM, warn.conflicts = F)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
aggr(data, prop=F, numbers = T)

## Missing value interpolation

Method 1: Delete data with missing value, if the size is small (always lower than 10%).

data("airquality")
L = length(airquality[,1])
any(is.na(airquality))
## [1] TRUE
airquality[complete.cases(airquality),][1:10,]
count = sum(! complete.cases(airquality))
rate = count/L
rate
## [1] 0.2745098

The size of data with missing value is over 10% (27.45%), so the method is not reasonable.

Method 2: Pairwise deletion

apply(airquality, 2, mean, na.rm = TRUE)
##      Ozone    Solar.R       Wind       Temp      Month        Day 
##  42.129310 185.931507   9.957516  77.882353   6.993464  15.803922
cor(airquality, use="pair")
##               Ozone     Solar.R        Wind       Temp        Month
## Ozone    1.00000000  0.34834169 -0.60154653  0.6983603  0.164519314
## Solar.R  0.34834169  1.00000000 -0.05679167  0.2758403 -0.075300764
## Wind    -0.60154653 -0.05679167  1.00000000 -0.4579879 -0.178292579
## Temp     0.69836034  0.27584027 -0.45798788  1.0000000  0.420947252
## Month    0.16451931 -0.07530076 -0.17829258  0.4209473  1.000000000
## Day     -0.01322565 -0.15027498  0.02718090 -0.1305932 -0.007961763
##                  Day
## Ozone   -0.013225647
## Solar.R -0.150274979
## Wind     0.027180903
## Temp    -0.130593175
## Month   -0.007961763
## Day      1.000000000

Method 3: Replaced by the value with highest frequency.

mean6 = apply(airquality,2,mean, na.rm=T)
mean6
##      Ozone    Solar.R       Wind       Temp      Month        Day 
##  42.129310 185.931507   9.957516  77.882353   6.993464  15.803922

Add one column to mark the incomplete records.

airquality$col = c("Mean_imputation", "notNA")[complete.cases(airquality[,1:2]) + 1]
View(airquality)

To identify if there exists missing value in every attribute.

for(i in 1:6){
  print(any(is.na(airquality[,i])))
}
## [1] TRUE
## [1] TRUE
## [1] FALSE
## [1] FALSE
## [1] FALSE
## [1] FALSE

Interpolate the missing value.

airquality[is.na(airquality$Ozone),"Ozone"] = mean6["Ozone"]
airquality[is.na(airquality$Solar.R),"Solar.R"] = mean6["Solar.R"]
any(is.na(airquality))
## [1] FALSE

Draw the interpolated histogram.

ggplot(airquality, aes(Ozone, fill = col)) + geom_histogram(alpha = 0.5, position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Draw the interpolated scatter figure.

ggplot(airquality, aes(x = Solar.R, y = Ozone, color = col)) + geom_point(size = 4)

Statistical the information after interpolation.

sd(airquality$Ozone)
## [1] 28.69337
cor(airquality$Ozone, airquality$Solar.R)
## [1] 0.3029695

Statistical the information before interpolation.

#Reload the data set.
data("airquality")
sd(airquality$Ozone, na.rm = T)
## [1] 32.98788
cor(airquality$Ozone, airquality$Solar.R, use = "complete.obs")
## [1] 0.3483417

Method4 : Regression imputation Add one column to identify the data with missing value.

data("airquality")
airquality$col = c("regression_imputation","notNA")[as.vector(! is.na(airquality["Ozone"]))+1]
fit = lm(Ozone~Solar.R, data = airquality)
a = which(! complete.cases(airquality))
airquality$Ozone[a] = as.vector(predict(fit, newdata = airquality[! complete.cases(airquality),]))
# new_data ==> data for predict
ggplot(airquality[complete.cases(airquality),], aes(Ozone, fill =col)) + 
  geom_histogram(alpha = 0.5, position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(airquality[complete.cases(airquality),], aes(x = Solar.R, y = Ozone, color = col)) +
  geom_point(size = 4)

sd(airquality$Ozone, na.rm = T)
## [1] 29.48287
cor(airquality$Ozone, airquality$Solar.R, use = "complete.obs")
## [1] 0.3884392
data("airquality")
sd(airquality$Ozone, na.rm = T)
## [1] 32.98788
cor(airquality$Ozone, airquality$Solar.R, use = "complete.obs")
## [1] 0.3483417

Method 5: Stochastic regression imputation

library(mice, warn.conflicts = F)
imp = mice(airquality[,1:2], method = "norm.nob", m = 1, maxit = 1, seed = 1) # maxit ?
## 
##  iter imp variable
##   1   1  Ozone  Solar.R
air = complete(imp)
air$col = c("imputation", "notNA")[complete.cases(airquality[,1:2]) +1]
ggplot(air, aes(Ozone, fill = col)) + 
  geom_histogram(alpha = 0.5, position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(air, aes(x = Solar.R, y = Ozone, color = col))+ 
  geom_point(size = 4)

Method 6: Multiple imputation

data("airquality")
imp = mice(airquality, seed = 1, print = F)
fit = with(imp, lm(Ozone~Wind+Temp+Solar.R))
pooled = pool(fit)
round(summary(pooled)[,2:6],3)[,c(1:3,5)]
fit.r = lm(Ozone~Wind+Temp+Solar.R, data=airquality)
round(coef(summary(fit.r)),3)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -64.342     23.055  -2.791    0.006
## Wind          -3.334      0.654  -5.094    0.000
## Temp           1.652      0.254   6.516    0.000
## Solar.R        0.060      0.023   2.580    0.011
imp$imp$Ozone[1:10,]
air = complete(imp)
air$col = c("imputation", "notNA")[complete.cases(airquality[,1:2]) +1]
ggplot(air, aes(Ozone, fill = col)) + 
  geom_histogram(alpha = 0.5, position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(air, aes(x = Solar.R, y = Ozone, color = col))+ 
  geom_point(size = 4)

Knn

library(DMwR, warn.conflicts = F)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
data("airquality")
air = knnImputation(airquality, k=10)
air$col = c("imputation", "notNA")[complete.cases(airquality[,1:2]) +1]
ggplot(air, aes(Ozone, fill = col)) + 
  geom_histogram(alpha = 0.5, position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(air, aes(x = Solar.R, y = Ozone, color = col))+ 
  geom_point(size = 4) 

6 Section 5: Detection of outliers

6.1 Detection of outliers of univariate and multivariate variables

set.seed(3147)
x = rnorm(100)
boxplot.stats(x)$out
## [1] -3.315391  2.685922 -3.055717  2.571203
boxplot(x)

6.2 Detection of local outliers

LOF is based on the density.

iris2 = iris[, 1:4]
outliers.scores = lofactor(iris2, k=5)   # k ==> Density boundaries
outliers = order(outliers.scores, decreasing = T)[1:5]
View(outliers.scores)
outliers
## [1]  42 107  23 110  63
par(mfrow = c(1,2))
plot(density(outliers.scores))
n = nrow(iris2)
labels = 1:n
labels[- outliers] = "."
biplot(prcomp(iris2), cex = .8, xlabs = labels)

7 Section 6: Variable selection

7.1 Filter

  • Delete variables with high level of missing value
  • Delete variables with low variance (always near to zero)
  • Delete variables with strong correlation.
  • Filter by variable clustering
library(AppliedPredictiveModeling)
library(caret, warn.conflicts = F)
library(corrplot, warn.conflicts = F)
## corrplot 0.84 loaded
data("segmentationOriginal")
segData = subset(segmentationOriginal, Case == "Train")
dim(segData)
## [1] 1009  119
segData = segData[, -(1:3)]
statusColNum = grep("Status", names(segData))
segData = segData[, -statusColNum]
correlations = cor(segData)
dim(correlations)
## [1] 58 58
correlations[1:4,1:4]
##                 AngleCh1      AreaCh1 AvgIntenCh1 AvgIntenCh2
## AngleCh1     1.000000000 -0.002627172 -0.04300776 -0.01944681
## AreaCh1     -0.002627172  1.000000000 -0.02529739 -0.15330301
## AvgIntenCh1 -0.043007757 -0.025297394  1.00000000  0.52521711
## AvgIntenCh2 -0.019446810 -0.153303007  0.52521711  1.00000000
corrplot(correlations, order = "hclust", tl.cex = 0.7)

highCorr = findCorrelation(correlations, cutoff = .75)
length(highCorr)
## [1] 32
filteredSegData = segData[,-highCorr]
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:base':
## 
##     format.pval, units
v = varclus(as.matrix(segData))
# print(round(v$sim,2))
plot(v)

nvars = 30
tree = cutree(v$hclust, nvars)
tab = table(tree)
predictors.select = rep(0,30)
for(i in 1:nvars){
  if(sum(tree == i) == 1) predictors.select[i] = names(tree[tree == i])
  else predictors.select[i] = names(sample(tree[tree == i],1))
}
predictors.select
##  [1] "AngleCh1"               "AreaCh1"                "DiffIntenDensityCh1"   
##  [4] "TotalIntenCh2"          "DiffIntenDensityCh3"    "TotalIntenCh4"         
##  [7] "ConvexHullAreaRatioCh1" "EntropyIntenCh1"        "EntropyIntenCh3"       
## [10] "EntropyIntenCh4"        "ShapeLWRCh1"            "EqEllipseProlateVolCh1"
## [13] "FiberAlign2Ch3"         "FiberAlign2Ch4"         "FiberLengthCh1"        
## [16] "FiberWidthCh1"          "IntenCoocASMCh3"        "IntenCoocEntropyCh4"   
## [19] "IntenCoocContrastCh4"   "SkewIntenCh1"           "KurtIntenCh3"          
## [22] "SkewIntenCh4"           "NeighborVarDistCh1"     "NeighborMinDistCh1"    
## [25] "ShapeBFRCh1"            "SpotFiberCountCh3"      "SpotFiberCountCh4"     
## [28] "TotalIntenCh3"          "XCentroid"              "YCentroid"