### 泰坦尼克号乘客生存分析
rm(list = ls())
### 导入包
library(glmnet)
## 载入需要的程辑包:Matrix
## Loaded glmnet 4.1-8
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
## Warning: 程辑包'GGally'是用R版本4.3.3 来建造的
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# 帮助函数,用来辅助画好看的pair plot
my_bin <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") {
ggplot(data = data, mapping = mapping) +
geom_bin2d(...) +
scale_fill_gradient(low = low, high = high)
}
##############################################################
# 1. 熟悉数据
#### 读取数据 `titanicpassengers-bbm.dat` 到 `tit`
tit = read.csv('titanicpassengers-bbm.dat')
tit = tit %>% mutate(Passenger.Class = factor(Passenger.Class))
dim(tit) #查看数据维度
## [1] 1309 10
names(tit) #查看数据有哪些变量
## [1] "Name" "Survived" "Passenger.Class"
## [4] "Sex" "Age" "Siblings.and.Spouses"
## [7] "Parents.and.Children" "Fare" "Port"
## [10] "Home...Destination"
View(tit) #查看数据
data.frame(sapply(tit, function(x) sum(is.na(x)))) #检查数据的缺失情况
## sapply.tit..function.x..sum.is.na.x...
## Name 0
## Survived 0
## Passenger.Class 0
## Sex 0
## Age 263
## Siblings.and.Spouses 0
## Parents.and.Children 0
## Fare 1
## Port 0
## Home...Destination 0
#### 研究下面3个变量之间的关系:性别,船票等级,和是否存活。
tit.temp = tit %>% mutate(Survived = ifelse(Survived == "Yes",1,0))
tit.temp1 <- tit.temp %>% group_by(Passenger.Class,Sex) %>%
summarise_at(vars(Survived),list(~ mean(.,na.rm=TRUE)))
rm(tit.temp)
ggplot(tit.temp1, aes(x = Passenger.Class, y = Sex)) +
geom_point(alpha=0.5, aes(size = Survived, color = Survived)) +
scale_color_gradientn(colours = rev(terrain.colors(7)[1:5])) +
geom_text(aes(label = paste0(round(Survived*100,2),"%")),size = 2) +
scale_size_continuous(range = c(20, 50)) +
theme(legend.position = "none") +
ggtitle("Survival Rate of Passagers")

#### 我们也可以使用 'ggpairs' 来研究变量之间的关系。
tit[,c(2:7,9)] %>% na.omit() %>%
ggpairs(progress = FALSE, lower=list(combo=wrap("facethist", binwidth=1),
continuous=wrap(my_bin, binwidth=0.25))) +
theme(axis.ticks=element_blank(),
axis.line=element_blank())

#### 考虑一部分变量。
tit[,c(2:5,9)] %>% na.omit() %>%
ggpairs(progress = FALSE, lower=list(combo=wrap("facethist", binwidth=1),
continuous=wrap(my_bin, binwidth=0.25))) +
theme(axis.ticks=element_blank(),
axis.line=element_blank())

# 2. 多元逻辑回归
#### 对于逻辑回归模型,因变量是分类变量,我们需要对其进行处理。
tit = tit %>% mutate(Survived = factor(Survived, levels = c("No", "Yes")))
#### 为了避免出现问题,我们将删除所有有缺失值的乘客信息。
tit.comp <- na.omit(tit)
dim(tit.comp)
## [1] 1045 10
## a. 用所有的变量
### i. 全模型:考虑所有的变量
fit1 <- glm(Survived~., tit.comp[,-c(1,10)], family=binomial(logit))
summary(fit1)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp[,
## -c(1, 10)])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.551e+01 3.707e+02 0.042 0.96662
## Passenger.Class2 -1.109e+00 2.693e-01 -4.120 3.79e-05 ***
## Passenger.Class3 -2.052e+00 2.778e-01 -7.387 1.50e-13 ***
## Sexmale -2.613e+00 1.796e-01 -14.550 < 2e-16 ***
## Age -3.812e-02 6.724e-03 -5.670 1.43e-08 ***
## Siblings.and.Spouses -3.512e-01 1.087e-01 -3.231 0.00123 **
## Parents.and.Children 5.125e-02 1.042e-01 0.492 0.62275
## Fare 2.743e-04 1.975e-03 0.139 0.88954
## PortC -1.125e+01 3.707e+02 -0.030 0.97578
## PortQ -1.270e+01 3.707e+02 -0.034 0.97267
## PortS -1.191e+01 3.707e+02 -0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1413.57 on 1044 degrees of freedom
## Residual deviance: 954.57 on 1034 degrees of freedom
## AIC: 976.57
##
## Number of Fisher Scoring iterations: 12
#### 根据上面的结果,`登船港口(Port)`这个变量好像有问题。
table.habor <- data.frame(table(tit.comp$Port))
colnames(table.habor) <- c("Port","频数")
table.habor
## Port 频数
## 1 2
## 2 C 212
## 3 Q 50
## 4 S 781
#### 根据上面的输出,有两个观测值的登船港口是空的,但是并没有被做为确失值来对待。
#### 我们需要将它们替换成NA。
tit <- tit %>% mutate(Port = factor(Port, levels=c("S","C","Q")))
tit.comp2 <- na.omit(tit)
dim(tit.comp2)
## [1] 1043 10
#### 重新拟合模型,看和登船港口有关的系数发生了什么变化。
fit2 <- glm(Survived~., tit.comp2[,-c(1,10)], family=binomial(logit))
summary(fit2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[,
## -c(1, 10)])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5950512 0.4111480 8.744 < 2e-16 ***
## Passenger.Class2 -1.1093557 0.2692636 -4.120 3.79e-05 ***
## Passenger.Class3 -2.0519804 0.2777922 -7.387 1.50e-13 ***
## Sexmale -2.6128646 0.1795723 -14.550 < 2e-16 ***
## Age -0.0381233 0.0067240 -5.670 1.43e-08 ***
## Siblings.and.Spouses -0.3512240 0.1086969 -3.231 0.00123 **
## Parents.and.Children 0.0512518 0.1041801 0.492 0.62275
## Fare 0.0002743 0.0019752 0.139 0.88954
## PortC 0.6620573 0.2154290 3.073 0.00212 **
## PortQ -0.7864951 0.4091805 -1.922 0.05459 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.57 on 1033 degrees of freedom
## AIC: 974.57
##
## Number of Fisher Scoring iterations: 5
#### 删除不显著的自变量`船上父母孩子数量(Siblings.and.Spouses)` 和 `船票价格(Fare)`。
fit3 <- glm(Survived~., tit.comp2[,-c(1,7:8,10)], family=binomial(logit))
summary(fit3)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[,
## -c(1, 7:8, 10)])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.647197 0.377513 9.661 < 2e-16 ***
## Passenger.Class2 -1.126422 0.243779 -4.621 3.82e-06 ***
## Passenger.Class3 -2.069269 0.238929 -8.661 < 2e-16 ***
## Sexmale -2.632629 0.176375 -14.926 < 2e-16 ***
## Age -0.038306 0.006712 -5.707 1.15e-08 ***
## Siblings.and.Spouses -0.332316 0.103047 -3.225 0.00126 **
## PortC 0.668459 0.212694 3.143 0.00167 **
## PortQ -0.802769 0.408617 -1.965 0.04946 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.88 on 1035 degrees of freedom
## AIC: 970.88
##
## Number of Fisher Scoring iterations: 5
### ii. 是否需要这两个不显著的自变量`船上父母孩子数量`和`船票价格`?
#### 我们可以使用似然比检验,来判断是不是需要某些变量。
#### 原假设是小的模型是足够的,也就是说必须要这两个变量。这个检验和线性回归中的方差分析是类似的。
anova(fit3,fit2)
## Analysis of Deviance Table
##
## Model 1: Survived ~ Passenger.Class + Sex + Age + Siblings.and.Spouses +
## Port
## Model 2: Survived ~ Passenger.Class + Sex + Age + Siblings.and.Spouses +
## Parents.and.Children + Fare + Port
## Resid. Df Resid. Dev Df Deviance
## 1 1035 954.88
## 2 1033 954.57 2 0.3119
#### 我们也可以直接来计算卡方统计量,来做上述检验。
chi.sq.2 <- fit3$deviance-fit2$deviance
chi.sq.2
## [1] 0.3119023
pchisq(chi.sq.2, 2, lower.tail=FALSE)
## [1] 0.855601
#### 以上两个分析都说明我们只需要小的模型。
### b. 最后的模型
fit3 <- glm(Survived~., tit.comp2[,-c(1,7:8,10)], family=binomial(logit))
summary(fit3)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[,
## -c(1, 7:8, 10)])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.647197 0.377513 9.661 < 2e-16 ***
## Passenger.Class2 -1.126422 0.243779 -4.621 3.82e-06 ***
## Passenger.Class3 -2.069269 0.238929 -8.661 < 2e-16 ***
## Sexmale -2.632629 0.176375 -14.926 < 2e-16 ***
## Age -0.038306 0.006712 -5.707 1.15e-08 ***
## Siblings.and.Spouses -0.332316 0.103047 -3.225 0.00126 **
## PortC 0.668459 0.212694 3.143 0.00167 **
## PortQ -0.802769 0.408617 -1.965 0.04946 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.88 on 1035 degrees of freedom
## AIC: 970.88
##
## Number of Fisher Scoring iterations: 5
#### 回归分析的系数可以用赔率比来解释。
### 练习:男性和女性的存活率差别?船票等级不同对应的差别?
#### 男女性存活率的差别
df.gender <- data.frame(tit[1:5,-c(2,4,7,8,10)])
colnames(df.gender)[1] <- "Case"
df.gender$Case <- c("A","B","C","D","E")
df.gender$Passenger.Class <- c(1,1,2,2,3) %>% factor()
df.gender$Age <- c(20,40,55,29,17)
df.gender$Siblings.and.Spouses <- c(1,0,3,6,1)
df.gender$Port <- c("S","Q","C","Q","S") %>% factor(levels=c("S","C","Q"))
df.gender
## Case Passenger.Class Age Siblings.and.Spouses Port
## 1 A 1 20 1 S
## 2 B 1 40 0 Q
## 3 C 2 55 3 C
## 4 D 2 29 6 Q
## 5 E 3 17 1 S
for (i in 1:5){
df.gender[i,"女性"] <- predict(fit3,c(df.gender[i,],Sex="female"),type='response')}
for (i in 1:5){
df.gender[i,"男性"] <- predict(fit3,c(df.gender[i,],Sex="male"),type='response')}
df.gender$差值 <- df.gender$女性 - df.gender$男性
df.gender$女性<- paste0((df.gender$女性*100)%>% round(2),"%")
df.gender$男性 <- paste0((df.gender$男性*100)%>% round(2),"%")
df.gender$差值 <- paste0((df.gender$差值*100)%>% round(2),"%")
row.names(df.gender) <- NULL
df.gender
## Case Passenger.Class Age Siblings.and.Spouses Port 女性 男性 差值
## 1 A 1 20 1 S 92.75% 47.9% 44.84%
## 2 B 1 40 0 Q 78.79% 21.07% 57.71%
## 3 C 2 55 3 C 52.14% 7.26% 44.87%
## 4 D 2 29 6 Q 19.99% 1.76% 18.23%
## 5 E 3 17 1 S 64.44% 11.52% 52.91%
#### 不同船票等级的存活率差别
df.class <- data.frame(tit[1:5,-c(2,3,7,8,10)])
colnames(df.class)[1] <- "Case"
df.class$Case <- c("A","B","C","D","E")
df.class$Sex <- c("female","male","female","female","male") %>% factor()
df.class$Age <- c(20,40,18,29,17)
df.class$Siblings.and.Spouses <- c(1,0,3,6,1)
df.class$Port <- c("S","Q","C","Q","S") %>% factor(levels=c("S","C","Q"))
df.class$Passenger.Class <- rep(1,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"一等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class$Passenger.Class <- rep(2,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"二等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class$Passenger.Class <- rep(3,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"三等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class <- df.class[,-6]
df.class$一等船票 <- paste0((df.class$一等船票 * 100) %>% round(2), "%")
df.class$二等船票 <- paste0((df.class$二等船票 * 100) %>% round(2), "%")
df.class$三等船票 <- paste0((df.class$三等船票 * 100) %>% round(2), "%")
row.names(df.class) <- NULL
df.class
## Case Sex Age Siblings.and.Spouses Port 一等船票 二等船票 三等船票
## 1 A female 20 1 S 92.75% 80.57% 61.76%
## 2 B male 40 0 Q 21.07% 7.97% 3.26%
## 3 C female 18 3 C 93.27% 81.8% 63.64%
## 4 D female 29 6 Q 43.53% 19.99% 8.87%
## 5 E male 17 1 S 50.78% 25.06% 11.52%
### 练习:计算Jack和Rose的存活率。
tit.pred <- tit[1:2,-c(2,7,8,10)]
tit.pred[1,] <- c("Jack",3,"male",17,0,"S")
tit.pred[2,] <- c("Rose",1,"female",20,1,"S")
tit.pred$Age <- tit.pred$Age %>% as.numeric()
tit.pred$Siblings.and.Spouses <- tit.pred$Siblings.and.Spouses %>% as.numeric()
tit.pred
## Name Passenger.Class Sex Age Siblings.and.Spouses Port
## 1 Jack 3 male 17 0 S
## 2 Rose 1 female 20 1 S
### 预测Jack和Rose的存活率
pred.jack <- predict(fit3,tit.pred[1,],type='response')
pred.rose <- predict(fit3,tit.pred[2,],type='response')
print(paste0("Jack的存活率: ", (pred.jack * 100) %>% round(2), "%"))
## [1] "Jack的存活率: 15.37%"
print(paste0("Rose的存活率: ", (pred.rose * 100) %>% round(2), "%"))
## [1] "Rose的存活率: 92.75%"
tit.pred$存活率 <- paste0((c(pred.jack, pred.rose) * 100) %>% round(2), "%")
tit.pred
## Name Passenger.Class Sex Age Siblings.and.Spouses Port 存活率
## 1 Jack 3 male 17 0 S 15.37%
## 2 Rose 1 female 20 1 S 92.75%