课程回放:(https://www.cctalk.com/v/15407237534698?xh_fshareuid=87b7f11f-6ceb-c339-6ae2-1632fe5f61ea&xh_preshareid=d4d94a43-9b8b-4a59-ae17-7aadf5918469)

导入数据:

Salaries <- read.csv("https://www.dropbox.com/s/h4wny8slh4x1v1s/Salaries.csv?dl=1" ) 

查看数据结构:

str(Salaries)
## 'data.frame':    397 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AssocProf","AsstProf",..: 3 3 2 3 3 1 3 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yrs.since.phd: int  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : int  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ salary       : int  139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...

回顾取子集的方法:

Salaries$yrs.service        # yrs.service variable
##   [1] 18 16  3 39 41  6 23 45 20 18  8  2  1  0 18  3 20 34 23 36 26 31 30
##  [24] 19  8  8 23  3  0  8  4  2  9  2  2  0 21  4 31  9  2 23 27 38 19 15
##  [47] 28 19 25  1 28 11  3  9 11  5 21  8  9  3  8  2 31 11  3  8 12 31 17
##  [70] 36  2 45 19 34 23  3  3 19  1  2 28 16 20  2 18 14 37  2 25  7  5  7
##  [93]  7 38 20  0 12  7 14 26 25 23  5 14 10 28  8  8  8 31 16 16  1 37  0
## [116]  9 29 36  1  3 14 32 22 22 22 49 26  0 30  2  9 57  8  1 25 18 14 14
## [139]  7 18  8 10 11  3 27 28  4 27 26  3 12  4  9 10  0 21 18  0  6 16  2
## [162] 19  7  3  0  8 16 19  6 18  5 19 24 20  6 25  7  9 14  3 11  5  8 22
## [185] 23 30 10 10 28 19  9 22 18 19 53  7  4  4 33 22  4 40 17 17  5  2 33
## [208] 18  2 20  3 39  7 19  1 11 11 22  7 11 21 10  6 20 35 20  1  7 11 38
## [231] 27 24 19 19  3 17 25  6 40  6  3 30 37 23 23 11 23 18 23  7 39  8 12
## [254]  2  7  8 22 23  3 30 33 45 26 31 35 30 43 10 44  7 40 18  1  4  3  6
## [277] 48 27 18 46 38 27 51 43  6 49 27  0 27  5  7 28  9  1  7 36 18 11 43
## [300] 39 36 16 13  4 44 31  4 28  0 15  7  9 19 35  6  3  9 45 16 15 23  9
## [323] 11 15 31  4 15 37 10 23 60  9 10 19  6 38 23 12 25 15 11 17 38 31 35
## [346] 10 27 33  3 28 49 38 27 20  1 21 40 35 14  4 11 15 30 17 43 40 10  1
## [369] 30 31  8 20  7 26 19 26  1  3 38  8  3 23  5 44 21  9 27 15 36 18 19
## [392] 19 30 19 25 15  4
Salaries[2,]                # second row, observation
##   rank discipline yrs.since.phd yrs.service  sex salary
## 2 Prof          B            20          16 Male 173200
Salaries$discipline[6]      # 6th observation of the discipline variable
## [1] B
## Levels: A B
Salaries[5,"yrs.service"]   # row 5 of the yrs.service variable
## [1] 41
Salaries[4,3]               # row 4, column 3
## [1] 45
Salaries[2:6,]              # observations 2 through 6
##        rank discipline yrs.since.phd yrs.service  sex salary
## 2      Prof          B            20          16 Male 173200
## 3  AsstProf          B             4           3 Male  79750
## 4      Prof          B            45          39 Male 115000
## 5      Prof          B            40          41 Male 141500
## 6 AssocProf          B             6           6 Male  97000
Salaries[c(7,9,16),]        # observations 7, 9, and 16
##    rank discipline yrs.since.phd yrs.service  sex salary
## 7  Prof          B            30          23 Male 175000
## 9  Prof          B            21          20 Male 119250
## 16 Prof          B            12           3 Male 117150

使用sum函数,结合is.na()查看是否存在缺失值:

sum( is.na(Salaries) )
## [1] 0
sum( Salaries=="" )
## [1] 0

查看数据的描述性统计:

summary(Salaries)
##         rank     discipline yrs.since.phd    yrs.service        sex     
##  AssocProf: 64   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AsstProf : 67   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      salary      
##  Min.   : 57800  
##  1st Qu.: 91000  
##  Median :107300  
##  Mean   :113706  
##  3rd Qu.:134185  
##  Max.   :231545

把工资变量对数化,存成新变量logSalary:

logSalary <- log(Salaries$salary)

将新变量加入到原来的数据:

Salaries <- data.frame(Salaries,
 logSalary = logSalary
)

再次查看数据的描述性统计:

summary(Salaries)
##         rank     discipline yrs.since.phd    yrs.service        sex     
##  AssocProf: 64   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AsstProf : 67   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      salary         logSalary    
##  Min.   : 57800   Min.   :10.96  
##  1st Qu.: 91000   1st Qu.:11.42  
##  Median :107300   Median :11.58  
##  Mean   :113706   Mean   :11.61  
##  3rd Qu.:134185   3rd Qu.:11.81  
##  Max.   :231545   Max.   :12.35

将教授级别变成有序分类变量

Salaries$rank <- ordered(Salaries$rank, levels=c("AsstProf","AssocProf","Prof"))

教授工资与其工作时间(年)

library(ggplot2)
ggplot(data=Salaries, aes(x=yrs.service, y=salary)) +
  geom_point() +
  theme_bw() +
  ggtitle("Professor's salaries from 2008-9") 

细节改进:横轴,纵轴标签

library(ggplot2)
ggplot(data=Salaries, aes(x=yrs.service, y=salary)) +
  geom_point() +
  theme_bw() +
  ggtitle("Professor's salaries from 2008-9") +
  theme( plot.title=element_text(vjust=1.0) ) +
  xlab("Years of service") +
  theme( axis.title.x = element_text(vjust=-.5) ) +
  ylab("Salary in thousands of dollars") +
  theme( axis.title.y = element_text(vjust=1.0) ) 

细节改进:按教授级别分别作图,更改背景

library(ggplot2)
plotSalFacRank <- ggplot(data=Salaries, aes(x=yrs.service, y=salary)) +
  geom_point(aes(color=rank)) +
  theme_bw() +
  ggtitle("Professor's salaries from 2008-9") +
  theme( plot.title=element_text(vjust=1.0) ) +
  xlab("Years of service") +
  theme( axis.title.x = element_text(vjust=-.5) ) +
  ylab("Salary in thousands of dollars") +
  theme( axis.title.y = element_text(vjust=1.0) ) +
  facet_wrap(~rank) +
  theme(strip.background = element_rect(fill = "White"))
plotSalFacRank

散点图:添加图表标题,在底部添加图例

library(ggplot2)
ggplot(data=Salaries, aes(x=yrs.service, y=salary)) +
  geom_point(aes(color=rank)) +
  theme_bw() +
  ggtitle("Professor's salaries from 2008-9") +
  theme( plot.title=element_text(vjust=1.0) ) +
  xlab("Years of service") +
  theme( axis.title.x = element_text(vjust=-.5) ) +
  ylab("Salary in thousands of dollars") +
  theme( axis.title.y = element_text(vjust=1.0) ) +
  theme(legend.position = "bottom")

简单回归模型

out <- lm(salary~sex,data=Salaries)
summary(out)
## 
## Call:
## lm(formula = salary ~ sex, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## sexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667

模型评估:adjusted R squared, BIC, AIC 从最全的模型开始 工资~学科+性别+级别+工作年数+年数平方 工资~学科+性别+级别+取得博士之后的年数+取得博士之后年数平方

哪个模型好?比较调整后的r方

Salaries$yrs.serviceSqr <- Salaries$yrs.service^2

outSer <- lm(salary~discipline + sex + rank + yrs.service+yrs.serviceSqr,data=Salaries)
summary(outSer)
## 
## Call:
## lm(formula = salary ~ discipline + sex + rank + yrs.service + 
##     yrs.serviceSqr, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62609 -14400  -1291  10408  99358 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    87562.830   4767.001  18.369  < 2e-16 ***
## disciplineB    13424.868   2317.296   5.793 1.42e-08 ***
## sexMale         4924.641   3884.345   1.268  0.20562    
## rank.L         33134.903   3375.817   9.815  < 2e-16 ***
## rank.Q          8450.673   2660.705   3.176  0.00161 ** 
## yrs.service      214.267    390.771   0.548  0.58379    
## yrs.serviceSqr    -6.058      7.485  -0.809  0.41885    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22660 on 390 degrees of freedom
## Multiple R-squared:  0.4487, Adjusted R-squared:  0.4402 
## F-statistic: 52.91 on 6 and 390 DF,  p-value: < 2.2e-16
Salaries$yrs.since.phdSqr <- Salaries$yrs.since.phd^2

outSin <- lm(salary~discipline + sex + rank + yrs.since.phd+yrs.since.phdSqr,data=Salaries)
summary(outSin)
## 
## Call:
## lm(formula = salary ~ discipline + sex + rank + yrs.since.phd + 
##     yrs.since.phdSqr, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62956 -13315  -1405   9831  96306 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      73596.082   6871.669  10.710  < 2e-16 ***
## disciplineB      14199.904   2331.061   6.092 2.68e-09 ***
## sexMale           5233.598   3860.948   1.356 0.176036    
## rank.L           24112.962   4372.761   5.514 6.38e-08 ***
## rank.Q            9389.186   2667.936   3.519 0.000484 ***
## yrs.since.phd     1512.625    565.503   2.675 0.007791 ** 
## yrs.since.phdSqr   -25.037      9.508  -2.633 0.008795 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22490 on 390 degrees of freedom
## Multiple R-squared:  0.4569, Adjusted R-squared:  0.4485 
## F-statistic: 54.68 on 6 and 390 DF,  p-value: < 2.2e-16

有可以移除的变量吗?,使用step函数 最好的模型(AIC最小):

step(outSin)
## Start:  AIC=7963.59
## salary ~ discipline + sex + rank + yrs.since.phd + yrs.since.phdSqr
## 
##                    Df  Sum of Sq        RSS    AIC
## - sex               1 9.2964e+08 1.9825e+11 7963.5
## <none>                           1.9732e+11 7963.6
## - yrs.since.phdSqr  1 3.5081e+09 2.0083e+11 7968.6
## - yrs.since.phd     1 3.6199e+09 2.0094e+11 7968.8
## - discipline        1 1.8774e+10 2.1609e+11 7997.7
## - rank              2 2.8413e+10 2.2573e+11 8013.0
## 
## Step:  AIC=7963.45
## salary ~ discipline + rank + yrs.since.phd + yrs.since.phdSqr
## 
##                    Df  Sum of Sq        RSS    AIC
## <none>                           1.9825e+11 7963.5
## - yrs.since.phdSqr  1 3.2254e+09 2.0147e+11 7967.9
## - yrs.since.phd     1 3.3910e+09 2.0164e+11 7968.2
## - discipline        1 1.9050e+10 2.1730e+11 7997.9
## - rank              2 2.9766e+10 2.2801e+11 8015.0
## 
## Call:
## lm(formula = salary ~ discipline + rank + yrs.since.phd + yrs.since.phdSqr, 
##     data = Salaries)
## 
## Coefficients:
##      (Intercept)       disciplineB            rank.L            rank.Q  
##         78469.60          14297.08          24641.88           9490.59  
##    yrs.since.phd  yrs.since.phdSqr  
##          1460.66            -23.92
best<-lm(salary~discipline + rank + yrs.service+yrs.serviceSqr,data=Salaries)

模型预测:

set.seed(100) 
trainingRowIndex <- sample(1:nrow(Salaries), 0.8*nrow(Salaries))
trainingData <- Salaries[trainingRowIndex, ] # model training data
testData <- Salaries[-trainingRowIndex, ] # test data

然后,用训练数据建立一个模型

bestMod <- lm(salary~discipline + rank + yrs.service+yrs.serviceSqr,data=trainingData) # build the model

再然后,将我们建立的模型用来测试数据上,做预测

salaryPred <- predict(bestMod, testData) # predict distance

检验一下,我们的预测效果

actuals_preds <- data.frame(cbind(actuals=testData$salary, predicteds=salaryPred)) # make actuals_predicteds dataframe.

correlation_accuracy <- cor(actuals_preds$actuals,actuals_preds$predicteds) 
correlation_accuracy
## [1] 0.6137226

读取数据

advertising <- read.csv("https://www.dropbox.com/s/m6jh5kspianm215/advertising.csv?dl=1",sep="\t")

#advertising
colSums(is.na(advertising))
## Television      radio  newspaper      sales 
##          0          0          0          0
summary(advertising)
##    Television         radio          newspaper          sales      
##  Min.   :  0.70   Min.   : 0.000   Min.   :  0.30   Min.   : 1.60  
##  1st Qu.: 74.38   1st Qu.: 9.975   1st Qu.: 12.75   1st Qu.:10.47  
##  Median :149.75   Median :22.900   Median : 25.75   Median :13.05  
##  Mean   :147.04   Mean   :23.264   Mean   : 30.55   Mean   :14.40  
##  3rd Qu.:218.82   3rd Qu.:36.525   3rd Qu.: 45.10   3rd Qu.:17.70  
##  Max.   :296.40   Max.   :49.600   Max.   :114.00   Max.   :50.00
par(mfrow=c(1, 2))  # it divides graph area in two parts

boxplot(advertising$sales, col = "yellow", border="blue",
        main = "SALES boxplot",
        ylab = "g per decaliter")

boxplot(advertising$Television, col = "orange", border="blue",
        main = "Television boxplot",
        ylab = "percent values")

找出异常值

# sales outliers
boxplot.stats(advertising$sales)$out 
## [1] 50 45

sales直方图

library(ggplot2)
qplot(sales, data = advertising, geom="histogram", binwidth=1, 
      fill=I("azure4"), col=I("azure3")) +
  labs(title = "sales") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x ="sales (units)") +
  labs(y = "Frequency") +
  scale_y_continuous(breaks = c(0,2,4,6,8,10,12,14,16,18), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(1,5,10,20,30,50), minor_breaks = 5) +
  geom_vline(xintercept = mean(advertising$sales), show_guide=TRUE, color
             ="red", labels="Average") +
  geom_vline(xintercept = median(advertising$sales), show_guide=TRUE, color
             ="blue", labels="Median")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
## Warning: Ignoring unknown parameters: labels
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
## Warning: Ignoring unknown parameters: labels

Television 直方图

library(ggplot2)
qplot(Television, data = advertising, geom="histogram", binwidth=5, 
      fill=I("azure4"), col=I("azure3")) +
  labs(title = "Television") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x ="AD budget in Television (in ten thousand dollars)") +
  labs(y = "Frequency") +
  scale_y_continuous(breaks = c(0,2,4,6,8,10,12,14,16,18), minor_breaks = NULL) +
  scale_x_continuous(limits = c(0, 300)) +
  geom_vline(xintercept = mean(advertising$Television), show_guide=TRUE, color
             ="red", labels="Average") +
  geom_vline(xintercept = median(advertising$Television), show_guide=TRUE, color
             ="blue", labels="Median")
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
## Warning: Ignoring unknown parameters: labels
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
## Warning: Ignoring unknown parameters: labels

散点图

library(ggplot2)
ggplot(advertising, aes(x=Television, y=sales))+
  geom_point(colour = "blue", size = 1.5)+
  scale_y_continuous(limits=c(0,50))+
  scale_x_continuous(limits=c(0,300))+
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Television ad budget and sales relationship")

在散点图基础上添加线性回归拟合线

library(ggplot2)
ggplot(advertising, aes(x=Television, y=sales))+
  geom_point(colour = "blue", size = 1.5)+
  scale_y_continuous(limits=c(0,50))+
  scale_x_continuous(limits=c(0,300))+
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Television ad budget and sales relationship")+
  geom_smooth(method="lm", color="red")

建立模型

model1 <- lm(sales ~ Television, data = advertising)
summary(model1)
## 
## Call:
## lm(formula = sales ~ Television, data = advertising)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.432 -2.366 -0.440  1.659 37.827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.788117   0.679970   11.45   <2e-16 ***
## Television  0.044973   0.003996   11.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.84 on 198 degrees of freedom
## Multiple R-squared:  0.3901, Adjusted R-squared:  0.3871 
## F-statistic: 126.7 on 1 and 198 DF,  p-value: < 2.2e-16

回归诊断作图

#Residuals vs Fitted values
plot(model1, pch=16, col="blue", lty=1, lwd=2, which=1)

plot(model1, pch=16, col="blue", lty=1, lwd=2, which=4)

模型改进,标记异常值

advertising$outlier = ifelse(advertising$sales>40,"Y","N") # create condition Yes/No if outlier
library(ggplot2)
ggplot(data=advertising, aes(x=Television, y=sales, col=as.factor(outlier)))+
  geom_point()+
  ggtitle("Television ads and sales")+
  scale_y_continuous(limits=c(0,50))+
  scale_x_continuous(limits=c(0,300))+
  theme(plot.title = element_text(hjust = 0.5))

剔除异常值,创建新数据

ad_new<-subset(advertising, sales<40)

用新数据从新估计模型

model2 <- lm(sales ~ Television, data = ad_new)

新模型完爆旧模型:

summary(model2)
## 
## Call:
## lm(formula = sales ~ Television, data = ad_new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3885 -1.8990 -0.1932  2.0506  7.1992 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.073270   0.461619   15.32   <2e-16 ***
## Television  0.047399   0.002704   17.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.269 on 196 degrees of freedom
## Multiple R-squared:  0.6106, Adjusted R-squared:  0.6086 
## F-statistic: 307.4 on 1 and 196 DF,  p-value: < 2.2e-16
summary(model1)
## 
## Call:
## lm(formula = sales ~ Television, data = advertising)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.432 -2.366 -0.440  1.659 37.827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.788117   0.679970   11.45   <2e-16 ***
## Television  0.044973   0.003996   11.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.84 on 198 degrees of freedom
## Multiple R-squared:  0.3901, Adjusted R-squared:  0.3871 
## F-statistic: 126.7 on 1 and 198 DF,  p-value: < 2.2e-16
AIC(model2)
## [1] 1034.927
AIC(model1)
## [1] 1202.3
BIC(model2)
## [1] 1044.792
BIC(model1)
## [1] 1212.195

再应用一次机器学习方法:

set.seed(100)
trainingRowIndex <- sample(1:nrow(ad_new), 0.7*nrow(ad_new)) 

train <- ad_new[trainingRowIndex, ] # model training data
test <- ad_new[-trainingRowIndex, ] # test data

用训练数据建模,将得到的模型运用到测试数据来做预测

modTrain <- lm(sales ~ Television, data=train)  # build the model
predict <- predict(modTrain, test)  # predicted values
summary(modTrain)
## 
## Call:
## lm(formula = sales ~ Television, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2929 -2.1245 -0.1316  2.0828  7.2758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.044816   0.565180   12.46   <2e-16 ***
## Television  0.047156   0.003207   14.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.278 on 136 degrees of freedom
## Multiple R-squared:  0.6138, Adjusted R-squared:  0.611 
## F-statistic: 216.2 on 1 and 136 DF,  p-value: < 2.2e-16

将预测数据和实际数据放在一起(针对测试数据而言),计算相关系数,来衡量模型

act_pred <- data.frame(cbind(actuals=test$sales, predicteds=predict)) # actuals_predicteds 
cor(act_pred) # correlation_accuracy
##              actuals predicteds
## actuals    1.0000000  0.7743846
## predicteds 0.7743846  1.0000000

实际对比一下原始数据和预测数据

head(act_pred, n=10)
##    actuals predicteds
## 1     22.1  17.895452
## 6      7.2   7.455075
## 9      4.8   7.450359
## 10    10.6  16.466620
## 12    17.4  17.169247
## 15    19.0  16.669392
## 16    22.4  16.259133
## 17    12.5  10.242005
## 20    14.6  13.990921
## 21    18.0  17.343725

还可以再进一步,交叉验证:从上面6,4分的训练测试数据得到的模型表现不错,这是不是偶然碰巧的结果? 随机把数据分成K等份,K-1做训练,1做测试。得到k种结果,看看预测结果之间偏差如何? k- Fold Cross validation 曲线是否平行

library(DAAG)
## Loading required package: lattice
kfold <- CVlm(data = ad_new, form.lm = formula(sales ~ Television), m=5, 
                   dots = FALSE, seed=123, legend.pos="topleft",
                   main="Cross Validation; k=5",
                   plotit=TRUE, printit=FALSE)

逻辑回归模型 logistic model

Logistic regression involves fitting a curve to numeric data to make predictions about binary events. Arguably one of the most widely used machine learning methods。

我们这里有一个关于给瘫痪军人募款的数据库,我们想要建一个logistic regression model,来预测人们的慈善行为。

先看一下数据内容:

donors<-read.csv("https://www.dropbox.com/s/8w2bf93e4fqzyib/donors.csv?dl=1")
# Examine the dataset to identify potential independent variables
str(donors)
## 'data.frame':    93462 obs. of  13 variables:
##  $ donated          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ veteran          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bad_address      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age              : int  60 46 NA 70 78 NA 38 NA NA 65 ...
##  $ has_children     : int  0 1 0 0 1 0 1 0 0 0 ...
##  $ wealth_rating    : int  0 3 1 2 1 0 2 3 1 0 ...
##  $ interest_veterans: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ interest_religion: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pet_owner        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ catalog_shopper  : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ recency          : Factor w/ 2 levels "CURRENT","LAPSED": 1 1 1 1 1 1 1 1 1 1 ...
##  $ frequency        : Factor w/ 2 levels "FREQUENT","INFREQUENT": 1 1 1 1 1 2 2 1 2 2 ...
##  $ money            : Factor w/ 2 levels "HIGH","MEDIUM": 2 1 2 2 2 2 2 2 2 2 ...
# Explore the dependent variable
table(donors$donated)
## 
##     0     1 
## 88751  4711

我们采用如下几个变量来预测一个人是否捐赠:

donors<-read.csv("https://www.dropbox.com/s/8w2bf93e4fqzyib/donors.csv?dl=1")

# Build the donation model
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans, 
                      data = donors, family = "binomial")

# Summarize the model results
summary(donation_model)
## 
## Call:
## glm(formula = donated ~ bad_address + interest_religion + interest_veterans, 
##     family = "binomial", data = donors)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3480  -0.3192  -0.3192  -0.3192   2.5678  
## 
## Coefficients:
##                   Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)       -2.95139    0.01652 -178.664   <2e-16 ***
## bad_address       -0.30780    0.14348   -2.145   0.0319 *  
## interest_religion  0.06724    0.05069    1.327   0.1847    
## interest_veterans  0.11009    0.04676    2.354   0.0186 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37330  on 93461  degrees of freedom
## Residual deviance: 37316  on 93458  degrees of freedom
## AIC: 37324
## 
## Number of Fisher Scoring iterations: 5

他会捐款吗?我们来预测一下吧!

# Estimate the donation probability
donors$donation_prob <- predict(donation_model, type = "response")

# Find the donation probability of the average prospect
mean(donors$donated)
## [1] 0.05040551
# Predict a donation if probability of donation is greater than average (0.0504)
donors$donation_pred <- ifelse(donors$donation_prob > 0.0504, 1, 0)

# Calculate the model's accuracy
mean(donors$donated == donors$donation_pred)
## [1] 0.794815