titanic

### 泰坦尼克号乘客生存分析

rm(list = ls())

### 导入包
library(glmnet)

## 载入需要的程辑包：Matrix

## Loaded glmnet 4.1-8

library(dplyr)

## 
## 载入程辑包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(GGally)

## Warning: 程辑包'GGally'是用R版本4.3.3 来建造的

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# 帮助函数，用来辅助画好看的pair plot
my_bin <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") {
  ggplot(data = data, mapping = mapping) +
    geom_bin2d(...) +
    scale_fill_gradient(low = low, high = high)
}

##############################################################

# 1. 熟悉数据
#### 读取数据 `titanicpassengers-bbm.dat` 到 `tit`

tit = read.csv('titanicpassengers-bbm.dat')
tit = tit %>% mutate(Passenger.Class = factor(Passenger.Class))
dim(tit) #查看数据维度

## [1] 1309   10

names(tit) #查看数据有哪些变量

##  [1] "Name"                 "Survived"             "Passenger.Class"     
##  [4] "Sex"                  "Age"                  "Siblings.and.Spouses"
##  [7] "Parents.and.Children" "Fare"                 "Port"                
## [10] "Home...Destination"

View(tit) #查看数据

data.frame(sapply(tit, function(x) sum(is.na(x)))) #检查数据的缺失情况

##                      sapply.tit..function.x..sum.is.na.x...
## Name                                                      0
## Survived                                                  0
## Passenger.Class                                           0
## Sex                                                       0
## Age                                                     263
## Siblings.and.Spouses                                      0
## Parents.and.Children                                      0
## Fare                                                      1
## Port                                                      0
## Home...Destination                                        0

#### 研究下面3个变量之间的关系：性别，船票等级，和是否存活。
tit.temp = tit %>% mutate(Survived = ifelse(Survived == "Yes",1,0))
tit.temp1 <- tit.temp %>% group_by(Passenger.Class,Sex) %>%
  summarise_at(vars(Survived),list(~ mean(.,na.rm=TRUE)))
rm(tit.temp)

ggplot(tit.temp1, aes(x = Passenger.Class, y = Sex)) +
  geom_point(alpha=0.5, aes(size = Survived, color = Survived)) +
  scale_color_gradientn(colours = rev(terrain.colors(7)[1:5])) +
  geom_text(aes(label = paste0(round(Survived*100,2),"%")),size = 2) +
  scale_size_continuous(range = c(20, 50)) + 
  theme(legend.position = "none") +
  ggtitle("Survival Rate of Passagers")

#### 我们也可以使用 'ggpairs' 来研究变量之间的关系。
tit[,c(2:7,9)] %>% na.omit() %>% 
  ggpairs(progress = FALSE, lower=list(combo=wrap("facethist", binwidth=1), 
                                       continuous=wrap(my_bin, binwidth=0.25))) +
  theme(axis.ticks=element_blank(), 
        axis.line=element_blank())

#### 考虑一部分变量。
tit[,c(2:5,9)] %>% na.omit() %>% 
  ggpairs(progress = FALSE, lower=list(combo=wrap("facethist", binwidth=1), 
                                       continuous=wrap(my_bin, binwidth=0.25))) +
  theme(axis.ticks=element_blank(), 
        axis.line=element_blank())

# 2. 多元逻辑回归

#### 对于逻辑回归模型，因变量是分类变量，我们需要对其进行处理。
tit = tit %>% mutate(Survived = factor(Survived, levels = c("No", "Yes")))
#### 为了避免出现问题，我们将删除所有有缺失值的乘客信息。
tit.comp <- na.omit(tit)
dim(tit.comp)

## [1] 1045   10

## a. 用所有的变量

### i. 全模型：考虑所有的变量
fit1 <- glm(Survived~., tit.comp[,-c(1,10)], family=binomial(logit))
summary(fit1)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp[, 
##     -c(1, 10)])
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.551e+01  3.707e+02   0.042  0.96662    
## Passenger.Class2     -1.109e+00  2.693e-01  -4.120 3.79e-05 ***
## Passenger.Class3     -2.052e+00  2.778e-01  -7.387 1.50e-13 ***
## Sexmale              -2.613e+00  1.796e-01 -14.550  < 2e-16 ***
## Age                  -3.812e-02  6.724e-03  -5.670 1.43e-08 ***
## Siblings.and.Spouses -3.512e-01  1.087e-01  -3.231  0.00123 ** 
## Parents.and.Children  5.125e-02  1.042e-01   0.492  0.62275    
## Fare                  2.743e-04  1.975e-03   0.139  0.88954    
## PortC                -1.125e+01  3.707e+02  -0.030  0.97578    
## PortQ                -1.270e+01  3.707e+02  -0.034  0.97267    
## PortS                -1.191e+01  3.707e+02  -0.032  0.97436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1413.57  on 1044  degrees of freedom
## Residual deviance:  954.57  on 1034  degrees of freedom
## AIC: 976.57
## 
## Number of Fisher Scoring iterations: 12

#### 根据上面的结果，`登船港口(Port)`这个变量好像有问题。
table.habor <- data.frame(table(tit.comp$Port))
colnames(table.habor) <- c("Port","频数")
table.habor

##   Port 频数
## 1         2
## 2    C  212
## 3    Q   50
## 4    S  781

#### 根据上面的输出，有两个观测值的登船港口是空的，但是并没有被做为确失值来对待。
#### 我们需要将它们替换成NA。 
tit <- tit %>% mutate(Port = factor(Port, levels=c("S","C","Q")))
tit.comp2 <- na.omit(tit)
dim(tit.comp2)

## [1] 1043   10

#### 重新拟合模型，看和登船港口有关的系数发生了什么变化。
fit2 <- glm(Survived~., tit.comp2[,-c(1,10)], family=binomial(logit))
summary(fit2)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[, 
##     -c(1, 10)])
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           3.5950512  0.4111480   8.744  < 2e-16 ***
## Passenger.Class2     -1.1093557  0.2692636  -4.120 3.79e-05 ***
## Passenger.Class3     -2.0519804  0.2777922  -7.387 1.50e-13 ***
## Sexmale              -2.6128646  0.1795723 -14.550  < 2e-16 ***
## Age                  -0.0381233  0.0067240  -5.670 1.43e-08 ***
## Siblings.and.Spouses -0.3512240  0.1086969  -3.231  0.00123 ** 
## Parents.and.Children  0.0512518  0.1041801   0.492  0.62275    
## Fare                  0.0002743  0.0019752   0.139  0.88954    
## PortC                 0.6620573  0.2154290   3.073  0.00212 ** 
## PortQ                -0.7864951  0.4091805  -1.922  0.05459 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.57  on 1033  degrees of freedom
## AIC: 974.57
## 
## Number of Fisher Scoring iterations: 5

#### 删除不显著的自变量`船上父母孩子数量(Siblings.and.Spouses)` 和 `船票价格(Fare)`。
fit3 <- glm(Survived~., tit.comp2[,-c(1,7:8,10)], family=binomial(logit))
summary(fit3)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[, 
##     -c(1, 7:8, 10)])
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           3.647197   0.377513   9.661  < 2e-16 ***
## Passenger.Class2     -1.126422   0.243779  -4.621 3.82e-06 ***
## Passenger.Class3     -2.069269   0.238929  -8.661  < 2e-16 ***
## Sexmale              -2.632629   0.176375 -14.926  < 2e-16 ***
## Age                  -0.038306   0.006712  -5.707 1.15e-08 ***
## Siblings.and.Spouses -0.332316   0.103047  -3.225  0.00126 ** 
## PortC                 0.668459   0.212694   3.143  0.00167 ** 
## PortQ                -0.802769   0.408617  -1.965  0.04946 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.88  on 1035  degrees of freedom
## AIC: 970.88
## 
## Number of Fisher Scoring iterations: 5

### ii. 是否需要这两个不显著的自变量`船上父母孩子数量`和`船票价格`？

#### 我们可以使用似然比检验，来判断是不是需要某些变量。
#### 原假设是小的模型是足够的，也就是说必须要这两个变量。这个检验和线性回归中的方差分析是类似的。
anova(fit3,fit2)

## Analysis of Deviance Table
## 
## Model 1: Survived ~ Passenger.Class + Sex + Age + Siblings.and.Spouses + 
##     Port
## Model 2: Survived ~ Passenger.Class + Sex + Age + Siblings.and.Spouses + 
##     Parents.and.Children + Fare + Port
##   Resid. Df Resid. Dev Df Deviance
## 1      1035     954.88            
## 2      1033     954.57  2   0.3119

#### 我们也可以直接来计算卡方统计量，来做上述检验。
chi.sq.2 <- fit3$deviance-fit2$deviance
chi.sq.2

## [1] 0.3119023

pchisq(chi.sq.2, 2, lower.tail=FALSE)

## [1] 0.855601

#### 以上两个分析都说明我们只需要小的模型。

### b. 最后的模型
fit3 <- glm(Survived~., tit.comp2[,-c(1,7:8,10)], family=binomial(logit))
summary(fit3)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial(logit), data = tit.comp2[, 
##     -c(1, 7:8, 10)])
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           3.647197   0.377513   9.661  < 2e-16 ***
## Passenger.Class2     -1.126422   0.243779  -4.621 3.82e-06 ***
## Passenger.Class3     -2.069269   0.238929  -8.661  < 2e-16 ***
## Sexmale              -2.632629   0.176375 -14.926  < 2e-16 ***
## Age                  -0.038306   0.006712  -5.707 1.15e-08 ***
## Siblings.and.Spouses -0.332316   0.103047  -3.225  0.00126 ** 
## PortC                 0.668459   0.212694   3.143  0.00167 ** 
## PortQ                -0.802769   0.408617  -1.965  0.04946 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.88  on 1035  degrees of freedom
## AIC: 970.88
## 
## Number of Fisher Scoring iterations: 5

#### 回归分析的系数可以用赔率比来解释。

### 练习：男性和女性的存活率差别？船票等级不同对应的差别？

#### 男女性存活率的差别
df.gender <- data.frame(tit[1:5,-c(2,4,7,8,10)])
colnames(df.gender)[1] <- "Case"
df.gender$Case <- c("A","B","C","D","E")
df.gender$Passenger.Class <- c(1,1,2,2,3) %>% factor()
df.gender$Age <- c(20,40,55,29,17)
df.gender$Siblings.and.Spouses <- c(1,0,3,6,1)
df.gender$Port <- c("S","Q","C","Q","S") %>% factor(levels=c("S","C","Q"))
df.gender

##   Case Passenger.Class Age Siblings.and.Spouses Port
## 1    A               1  20                    1    S
## 2    B               1  40                    0    Q
## 3    C               2  55                    3    C
## 4    D               2  29                    6    Q
## 5    E               3  17                    1    S

for (i in 1:5){
  df.gender[i,"女性"] <- predict(fit3,c(df.gender[i,],Sex="female"),type='response')}
for (i in 1:5){
  df.gender[i,"男性"] <- predict(fit3,c(df.gender[i,],Sex="male"),type='response')}
df.gender$差值 <- df.gender$女性 - df.gender$男性
df.gender$女性<- paste0((df.gender$女性*100)%>% round(2),"%")
df.gender$男性 <- paste0((df.gender$男性*100)%>% round(2),"%")
df.gender$差值 <- paste0((df.gender$差值*100)%>% round(2),"%")
row.names(df.gender) <- NULL
df.gender

##   Case Passenger.Class Age Siblings.and.Spouses Port   女性   男性   差值
## 1    A               1  20                    1    S 92.75%  47.9% 44.84%
## 2    B               1  40                    0    Q 78.79% 21.07% 57.71%
## 3    C               2  55                    3    C 52.14%  7.26% 44.87%
## 4    D               2  29                    6    Q 19.99%  1.76% 18.23%
## 5    E               3  17                    1    S 64.44% 11.52% 52.91%

#### 不同船票等级的存活率差别
df.class <- data.frame(tit[1:5,-c(2,3,7,8,10)])
colnames(df.class)[1] <- "Case"
df.class$Case <- c("A","B","C","D","E")
df.class$Sex <- c("female","male","female","female","male") %>% factor()
df.class$Age <- c(20,40,18,29,17)
df.class$Siblings.and.Spouses <- c(1,0,3,6,1)
df.class$Port <- c("S","Q","C","Q","S") %>% factor(levels=c("S","C","Q"))

df.class$Passenger.Class <- rep(1,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"一等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class$Passenger.Class <- rep(2,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"二等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class$Passenger.Class <- rep(3,5) %>% factor(levels=c(1,2,3))
for (i in 1:5){df.class[i,"三等船票"] <- predict(fit3,df.class[i,],type='response')}
df.class <- df.class[,-6]
df.class$一等船票 <- paste0((df.class$一等船票 * 100) %>% round(2), "%")
df.class$二等船票 <- paste0((df.class$二等船票 * 100) %>% round(2), "%")
df.class$三等船票 <- paste0((df.class$三等船票 * 100) %>% round(2), "%")
row.names(df.class) <- NULL
df.class

##   Case    Sex Age Siblings.and.Spouses Port 一等船票 二等船票 三等船票
## 1    A female  20                    1    S   92.75%   80.57%   61.76%
## 2    B   male  40                    0    Q   21.07%    7.97%    3.26%
## 3    C female  18                    3    C   93.27%    81.8%   63.64%
## 4    D female  29                    6    Q   43.53%   19.99%    8.87%
## 5    E   male  17                    1    S   50.78%   25.06%   11.52%

### 练习：计算Jack和Rose的存活率。
tit.pred <- tit[1:2,-c(2,7,8,10)]
tit.pred[1,] <- c("Jack",3,"male",17,0,"S")
tit.pred[2,] <- c("Rose",1,"female",20,1,"S")
tit.pred$Age <- tit.pred$Age  %>% as.numeric()
tit.pred$Siblings.and.Spouses <- tit.pred$Siblings.and.Spouses  %>% as.numeric()
tit.pred

##   Name Passenger.Class    Sex Age Siblings.and.Spouses Port
## 1 Jack               3   male  17                    0    S
## 2 Rose               1 female  20                    1    S

### 预测Jack和Rose的存活率
pred.jack <- predict(fit3,tit.pred[1,],type='response')
pred.rose <- predict(fit3,tit.pred[2,],type='response')
print(paste0("Jack的存活率: ", (pred.jack * 100) %>% round(2), "%"))

## [1] "Jack的存活率: 15.37%"

print(paste0("Rose的存活率: ", (pred.rose * 100) %>% round(2), "%"))

## [1] "Rose的存活率: 92.75%"

tit.pred$存活率 <- paste0((c(pred.jack, pred.rose) * 100) %>% round(2), "%")
tit.pred

##   Name Passenger.Class    Sex Age Siblings.and.Spouses Port 存活率
## 1 Jack               3   male  17                    0    S 15.37%
## 2 Rose               1 female  20                    1    S 92.75%

titanic

dx

2024-03-10