The field of machine learning is concerned with the question of how to construct computer programs that automatically improve with experience. A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance at tasks in T, as measured by P, improves with experience E. – Mitchell’s 《Machine Learning》
Vast amounts of data are being generated in many fields, and the statistician’s job is to make sense of it all: to extract important patterns and trends, and to understand “what the data says”. We call this learning from data. – 《Elements of Statistical Learning》
Pattern recognition has its origins in engineering, whereas machine learning grew out of computer science. However, these activities can be viewed as two facets of the same field… – 《Pattern Recognition》
机器学习通常研究两大类问题:
- 有目标变量的有监督学习,通常用来预测未来
- 分类 Classification
- 回归 Regression
- 异常检测 Deviation Detection
- 没有目标变量的无监督学习,通常用来描述现在
- 聚类 Clustering
- 关联规则 Association Rule Discovery
- 序列挖掘 Sequential Pattern Discovery
1. 问题描述及理解
2. 数据描述及理解
3. 数据准备
数据准备步骤是在数据理解的基础上对数据进行整理转换,以方便输入到建模算法中。具体任务如下:
prcomp() need to center and scale, preProcess() %>% predict(); e1071::skewness(), hist(), lattice::histogram(), caret::BoxCoxTrans() %>% predict(), MASS::boxcox(), preProcess() %>% predict(); … scale(), caret::preProcess() %>% predict(); … caret::dummyVars() %>% predict(); impute::impute.knn(), preProcess() %>% predict()
4. 数据建模
5. 模型评估
6. 模型部署
- 模拟生成的数据
- 模拟生成的数据产生自下面的模型 \begin{equation} \left\{ \begin{array}{l} X \sim \mathcal{N}(0, I), \qquad Y|X=x\sim \mathrm{Bernoulli}(p(x))\\ \log\left(\frac{p(x)}{1-p(x)}\right) = x^{T} \beta + \epsilon \end{array} \right. \end{equation} where the model’s \(\beta=(8, 6, 7, 4, 3, 2, 1, 2, 6, 1)\), \(I\) is the 10-dimensional identity matrix, and the error term \(\epsilon\) follows a standard normal distribution.
- 现实数据(German Credit Data)
- iris data
- PimaIndiansDiabetes2 Data Set(mlbench package)
library(MASS)

# Simulate a binary response from a logistic model with Gaussian predictors.
n_obs <- 10000
n_pred <- 10

# Predictors: n_obs draws from a multivariate normal with zero mean and
# identity covariance.
I <- diag(n_pred)
X <- mvrnorm(n = n_obs, mu = numeric(n_pred), Sigma = I)

# Linear predictor: fixed coefficients plus standard-normal noise.
beta <- c(8, 6, 7, 4, 3, 2, 1, 2, 6, 1)
epsilon <- rnorm(n_obs)
linear_y <- X %*% beta + epsilon

# The inverse logit turns the linear predictor into a success probability;
# the response is one Bernoulli draw per row.
p <- 1 / (1 + exp(-linear_y))
y <- rbinom(n = n_obs, size = 1, prob = p)

# Assemble predictors x1..x10 and the response y into one data frame.
X <- as.data.frame(X)
names(X) <- paste0("x", seq_len(n_pred))
simulation <- cbind(X, y)
head(simulation)
## x1 x2 x3 x4 x5 x6
## 1 -1.6932736 1.6830843 -0.9623125 0.9361958 0.41037303 0.50358753
## 2 -1.4530986 -1.8877910 -0.8712916 1.5606637 -2.15229826 2.58383322
## 3 -1.0352852 -0.2828999 0.7361335 -0.1915065 0.80948616 -0.87006833
## 4 1.0964669 -1.1784492 0.7327553 -0.8175988 -0.51899757 0.05863843
## 5 1.2579615 -1.3201197 -1.0677687 -0.1280080 0.60316412 0.35847624
## 6 0.4700659 0.5466436 0.3385814 0.4856115 -0.03597478 -1.01484927
## x7 x8 x9 x10 y
## 1 1.1937871 -0.47722195 1.1556607 -2.063981020 0
## 2 0.5857647 1.18473839 -0.8039231 0.002948212 0
## 3 -1.9221342 0.80276078 -1.4348059 -0.831977271 0
## 4 -0.1780196 0.50609122 -0.3736140 -2.162454528 0
## 5 1.0082632 -0.30488880 1.3697818 1.158582193 1
## 6 -0.2115944 0.06002148 -0.1596358 -0.244158547 1
# Print the structure of the simulated data frame (column names, types and
# leading values).
str(simulation)
## 'data.frame': 10000 obs. of 11 variables:
## $ x1 : num -1.69 -1.45 -1.04 1.1 1.26 ...
## $ x2 : num 1.683 -1.888 -0.283 -1.178 -1.32 ...
## $ x3 : num -0.962 -0.871 0.736 0.733 -1.068 ...
## $ x4 : num 0.936 1.561 -0.192 -0.818 -0.128 ...
## $ x5 : num 0.41 -2.152 0.809 -0.519 0.603 ...
## $ x6 : num 0.5036 2.5838 -0.8701 0.0586 0.3585 ...
## $ x7 : num 1.194 0.586 -1.922 -0.178 1.008 ...
## $ x8 : num -0.477 1.185 0.803 0.506 -0.305 ...
## $ x9 : num 1.156 -0.804 -1.435 -0.374 1.37 ...
## $ x10: num -2.06398 0.00295 -0.83198 -2.16245 1.15858 ...
## $ y : int 0 0 0 0 1 1 1 0 0 1 ...
# Load the German credit data (whitespace-separated, no header row).
# The categorical attributes are coded as strings such as "A11". They must be
# read as factors so that as.integer() below yields their level codes; since
# R 4.0.0 the default is stringsAsFactors = FALSE, under which as.integer()
# would return NA for every categorical column. Request factors explicitly.
credit <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
  header = FALSE,
  sep = "",
  stringsAsFactors = TRUE
)
names(credit) <- c(paste0("x", 1:20), "y")

# Recode the response from {1, 2} to {0, 1}.
credit$y <- as.integer(credit$y) - 1

library(magrittr)

# Convert every column to its integer representation (factor level codes for
# categorical columns) and divide by the column maximum, so each column's
# maximum becomes 1.
credit <- credit %>%
  lapply(function(x) {
    codes <- as.integer(x)
    codes / max(codes)
  }) %>%
  as.data.frame()
head(credit)
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12
## 1 0.25 0.08333333 1.0 0.5 0.06344985 1.0 1.0 1.00 0.75 0.3333333 1.00 0.25
## 2 0.50 0.66666667 0.6 0.5 0.32300261 0.2 0.6 0.50 0.50 0.3333333 0.50 0.25
## 3 1.00 0.16666667 1.0 0.8 0.11376465 0.2 0.8 0.50 0.75 0.3333333 0.75 0.25
## 4 0.25 0.58333333 0.6 0.4 0.42781155 0.2 0.8 0.50 0.75 1.0000000 1.00 0.50
## 5 0.25 0.33333333 0.8 0.1 0.26432914 0.2 0.6 0.75 0.75 0.3333333 1.00 1.00
## 6 1.00 0.50000000 0.6 0.8 0.49147851 1.0 0.6 0.50 0.75 0.3333333 1.00 1.00
## x13 x14 x15 x16 x17 x18 x19 x20 y
## 1 0.8933333 1 0.6666667 0.50 0.75 0.5 1.0 0.5 0
## 2 0.2933333 1 0.6666667 0.25 0.75 0.5 0.5 0.5 1
## 3 0.6533333 1 0.6666667 0.25 0.50 1.0 0.5 0.5 0
## 4 0.6000000 1 1.0000000 0.25 0.75 1.0 0.5 0.5 0
## 5 0.7066667 1 1.0000000 0.50 0.75 1.0 0.5 0.5 1
## 6 0.4666667 1 1.0000000 0.25 0.50 1.0 1.0 0.5 0
# Print the structure of the rescaled credit data frame.
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ x1 : num 0.25 0.5 1 0.25 0.25 1 1 0.5 1 0.5 ...
## $ x2 : num 0.0833 0.6667 0.1667 0.5833 0.3333 ...
## $ x3 : num 1 0.6 1 0.6 0.8 0.6 0.6 0.6 0.6 1 ...
## $ x4 : num 0.5 0.5 0.8 0.4 0.1 0.8 0.4 0.2 0.5 0.1 ...
## $ x5 : num 0.0634 0.323 0.1138 0.4278 0.2643 ...
## $ x6 : num 1 0.2 0.2 0.2 0.2 1 0.6 0.2 0.8 0.2 ...
## $ x7 : num 1 0.6 0.8 0.8 0.6 0.6 1 0.6 0.8 0.2 ...
## $ x8 : num 1 0.5 0.5 0.5 0.75 0.5 0.75 0.5 0.5 1 ...
## $ x9 : num 0.75 0.5 0.75 0.75 0.75 0.75 0.75 0.75 0.25 1 ...
## $ x10: num 0.333 0.333 0.333 1 0.333 ...
## $ x11: num 1 0.5 0.75 1 1 1 1 0.5 1 0.5 ...
## $ x12: num 0.25 0.25 0.25 0.5 1 1 0.5 0.75 0.25 0.75 ...
## $ x13: num 0.893 0.293 0.653 0.6 0.707 ...
## $ x14: num 1 1 1 1 1 1 1 1 1 1 ...
## $ x15: num 0.667 0.667 0.667 1 1 ...
## $ x16: num 0.5 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.5 ...
## $ x17: num 0.75 0.75 0.5 0.75 0.75 0.5 0.75 1 0.5 1 ...
## $ x18: num 0.5 0.5 1 1 1 1 0.5 0.5 0.5 0.5 ...
## $ x19: num 1 0.5 0.5 0.5 0.5 1 0.5 1 0.5 0.5 ...
## $ x20: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ y : num 0 1 0 0 1 0 0 0 0 1 ...
# Load the iris data set and preview its first rows.
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Print the structure of the iris data frame.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
set.seed(1)
data(PimaIndiansDiabetes2, package = "mlbench")
data <- PimaIndiansDiabetes2
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Column 9 is held out of preprocessing and re-attached at the end as `class`;
# every transformation below is fitted on, and applied to, the other columns.
predictors <- data[, -9]

# Step 1 - standardisation: centre and scale each predictor.
preProcValue <- preProcess(x = predictors, method = c("center", "scale"))
scaleddata <- predict(object = preProcValue, newdata = predictors)

# Step 2 - Yeo-Johnson transformation: brings the data closer to a normal
# distribution and dampens the influence of outliers.
preProcbox <- preProcess(x = scaleddata, method = "YeoJohnson")
boxdata <- predict(object = preProcbox, newdata = scaleddata)

# Step 3 - missing-value imputation using the bagging algorithm.
library(ipred)
preProcimp <- preProcess(x = boxdata, method = "bagImpute")
procdata <- predict(object = preProcimp, newdata = boxdata)

# Re-attach the held-out column under the name `class` and preview the result.
procdata$class <- data[, 9]
head(procdata)
## pregnant glucose pressure triceps insulin mass
## 1 0.5284016 0.7595155 -0.03275471 0.52823166 0.31794703 0.1613449
## 2 -1.0902050 -1.4250227 -0.52420088 -0.01466832 -1.22004921 -0.9327976
## 3 0.8956985 1.5845945 -0.69028150 -1.01637841 0.02420507 -1.5205042
## 4 -1.0902050 -1.2509263 -0.52420088 -0.62258605 -0.67266546 -0.6791257
## 5 -1.5823833 0.4627602 -2.74133178 0.52823166 0.09906180 1.3218244
## 6 0.3065886 -0.1924995 0.12832774 -0.77660624 -0.41566226 -1.1067781
## pedigree age class
## 1 0.3807017 0.90154325 pos
## 2 -0.4347451 -0.20794297 neg
## 3 0.4674736 -0.11085497 pos
## 4 -1.3678689 -1.55542743 neg
## 5 1.7918753 -0.02068449 pos
## 6 -1.1705718 -0.31192824 neg
# Print the structure of the preprocessed data frame.
str(procdata)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 0.528 -1.09 0.896 -1.09 -1.582 ...
## $ glucose : num 0.76 -1.425 1.585 -1.251 0.463 ...
## $ pressure: num -0.0328 -0.5242 -0.6903 -0.5242 -2.7413 ...
## $ triceps : num 0.5282 -0.0147 -1.0164 -0.6226 0.5282 ...
## $ insulin : num 0.3179 -1.22 0.0242 -0.6727 0.0991 ...
## $ mass : num 0.161 -0.933 -1.521 -0.679 1.322 ...
## $ pedigree: num 0.381 -0.435 0.467 -1.368 1.792 ...
## $ age : num 0.9015 -0.2079 -0.1109 -1.5554 -0.0207 ...
## $ class : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...