knitr::opts_chunk$set(warning = FALSE,echo = TRUE,message = FALSE)
Kaplan-Meier survival curves.(Visually observe how long customers typically use a specific telecommunications service before churn.)
Stratified survival curves and Cox regression were used to examine the effect of different covariates on attrition.
Parametric model (Weibull, lognormal, exponential,gaussian) model to fit the survival model.
library(ggplot2)
library("survival")
library("survminer")
dat=read.csv("C:/Users/13587/Desktop/kagge/churn-bigml-80.csv")
#head(dat)
str(dat)
## 'data.frame': 2666 obs. of 20 variables:
## $ State : chr "KS" "OH" "NJ" "OH" ...
## $ Account.length : int 128 107 137 84 75 118 121 147 141 74 ...
## $ Area.code : int 415 415 415 408 415 510 510 415 415 415 ...
## $ International.plan : chr "No" "No" "No" "Yes" ...
## $ Voice.mail.plan : chr "Yes" "Yes" "No" "No" ...
## $ Number.vmail.messages : int 25 26 0 0 0 0 24 0 37 0 ...
## $ Total.day.minutes : num 265 162 243 299 167 ...
## $ Total.day.calls : int 110 123 114 71 113 98 88 79 84 127 ...
## $ Total.day.charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Total.eve.minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Total.eve.calls : int 99 103 110 88 122 101 108 94 111 148 ...
## $ Total.eve.charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Total.night.minutes : num 245 254 163 197 187 ...
## $ Total.night.calls : int 91 103 104 89 121 118 118 96 97 94 ...
## $ Total.night.charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Total.intl.minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 11.2 9.1 ...
## $ Total.intl.calls : int 3 3 5 7 3 6 7 6 5 5 ...
## $ Total.intl.charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 3.02 2.46 ...
## $ Customer.service.calls: int 1 1 0 2 3 0 3 0 0 0 ...
## $ Churn : chr "False" "False" "False" "False" ...
summary(dat)
## State Account.length Area.code International.plan
## Length:2666 Min. : 1.0 Min. :408.0 Length:2666
## Class :character 1st Qu.: 73.0 1st Qu.:408.0 Class :character
## Mode :character Median :100.0 Median :415.0 Mode :character
## Mean :100.6 Mean :437.4
## 3rd Qu.:127.0 3rd Qu.:510.0
## Max. :243.0 Max. :510.0
## Voice.mail.plan Number.vmail.messages Total.day.minutes Total.day.calls
## Length:2666 Min. : 0.000 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.: 0.000 1st Qu.:143.4 1st Qu.: 87.0
## Mode :character Median : 0.000 Median :179.9 Median :101.0
## Mean : 8.022 Mean :179.5 Mean :100.3
## 3rd Qu.:19.000 3rd Qu.:215.9 3rd Qu.:114.0
## Max. :50.000 Max. :350.8 Max. :160.0
## Total.day.charge Total.eve.minutes Total.eve.calls Total.eve.charge
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.:24.38 1st Qu.:165.3 1st Qu.: 87 1st Qu.:14.05
## Median :30.59 Median :200.9 Median :100 Median :17.08
## Mean :30.51 Mean :200.4 Mean :100 Mean :17.03
## 3rd Qu.:36.70 3rd Qu.:235.1 3rd Qu.:114 3rd Qu.:19.98
## Max. :59.64 Max. :363.7 Max. :170 Max. :30.91
## Total.night.minutes Total.night.calls Total.night.charge Total.intl.minutes
## Min. : 43.7 Min. : 33.0 Min. : 1.970 Min. : 0.00
## 1st Qu.:166.9 1st Qu.: 87.0 1st Qu.: 7.513 1st Qu.: 8.50
## Median :201.2 Median :100.0 Median : 9.050 Median :10.20
## Mean :201.2 Mean :100.1 Mean : 9.053 Mean :10.24
## 3rd Qu.:236.5 3rd Qu.:113.0 3rd Qu.:10.640 3rd Qu.:12.10
## Max. :395.0 Max. :166.0 Max. :17.770 Max. :20.00
## Total.intl.calls Total.intl.charge Customer.service.calls Churn
## Min. : 0.000 Min. :0.000 Min. :0.000 Length:2666
## 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000 Class :character
## Median : 4.000 Median :2.750 Median :1.000 Mode :character
## Mean : 4.467 Mean :2.764 Mean :1.563
## 3rd Qu.: 6.000 3rd Qu.:3.270 3rd Qu.:2.000
## Max. :20.000 Max. :5.400 Max. :9.000
#select rows with NA values in any column
na_rows <- dat[!complete.cases(dat), ]
na_rows
## [1] State Account.length Area.code
## [4] International.plan Voice.mail.plan Number.vmail.messages
## [7] Total.day.minutes Total.day.calls Total.day.charge
## [10] Total.eve.minutes Total.eve.calls Total.eve.charge
## [13] Total.night.minutes Total.night.calls Total.night.charge
## [16] Total.intl.minutes Total.intl.calls Total.intl.charge
## [19] Customer.service.calls Churn
## <0 行> (或0-长度的row.names)
#counting na
sum(is.na(dat))
## [1] 0
dat$Churn <- ifelse(dat$Churn == "True", 1, 0)
#histograms of each cols
col_names=colnames(dat)
# Set the size of the plot
par(mfrow = c(4, 5), mar = c(4, 4, 2, 1)) # Adjust the 'mar' parameter to control margins
# Loop through each column and create histograms for numeric columns
for (i in 1:ncol(dat)) {
if (is.numeric(dat[[i]])) {
hist(dat[[i]], main = paste(col_names[i]), xlab = col_names[i])
box()
}
}
# observe categorical variable
par(mfrow=c(1, 4))
for (i in 1:ncol(dat)) {
if (is.character(dat[[i]])) {
table_data <- table(dat[[i]])
#pie(table_data, main = paste("Pie Chart of", colnames(dat)[i]), labels = table_data, col = rainbow(length(table_data)))
barplot(table_data, main=paste(colnames(dat)[i]), xlab=table_data, ylab="Frequency")
}
}
library(DataExplorer)
#missing value and visualization distributions for all continuous features:
plot_intro(dat)
plot_missing(dat, group=c("Good"=1.0), theme_config=list(text = element_text(size = 16)))
plot_histogram(dat)
plot_correlation(na.omit(dat), maxcat = 5L)
# type = "c"/"d" for only discrete/continuous features
dat$Account.length=as.numeric(dat$Account.length)
#str(dat)
fit <- survfit(Surv(Account.length, Churn) ~ 1, data = dat)
print(fit)
## Call: survfit(formula = Surv(Account.length, Churn) ~ 1, data = dat)
##
## n events median 0.95LCL 0.95UCL
## [1,] 2666 388 197 181 NA
# Summary of survival curves
#summary(fit)
ggsurvplot(fit,
xlab="Days",
ylab="Survival Probability",
risk.table=TRUE,
conf.int=TRUE,
surv.median.line="hv",
title = "Survival curves") # draw horizontal AND vertical line for median
#Cumulative risk curve/event
##As time (t) increases, the cumulative risk increases, reflecting the growing likelihood of experiencing the event.\
ggsurvplot(fit,xlab="Days",
fun = "cumhaz", #"event"
conf.int = TRUE, # confidence Interval
palette = "lancet",
ggtheme = theme_bw() ,title = "Survival curves"
)
#This plot shows the number of events (survival events or failures) at each time point, reflecting how the events are distributed over time.
ggsurvplot(fit,xlab="Days",
fun = "event",
conf.int = TRUE, # confidence Interval
palette = "lancet",
ggtheme = theme_bw() ,title = "Survival curves"
)
table(dat$Churn)
##
## 0 1
## 2278 388
fit1 <- survfit(Surv(Account.length, Churn) ~ strata(International.plan), data = dat)
print(fit1)
## Call: survfit(formula = Surv(Account.length, Churn) ~ strata(International.plan),
## data = dat)
##
## n events median 0.95LCL 0.95UCL
## strata(International.plan)=No 2396 270 212 201 NA
## strata(International.plan)=Yes 270 118 133 125 147
fit2 <- survfit(Surv(Account.length, Churn) ~ strata(Voice.mail.plan), data = dat)
print(fit2)
## Call: survfit(formula = Surv(Account.length, Churn) ~ strata(Voice.mail.plan),
## data = dat)
##
## n events median 0.95LCL 0.95UCL
## strata(Voice.mail.plan)=No 1933 323 181 173 NA
## strata(Voice.mail.plan)=Yes 733 65 NA NA NA
# Visualize (KM plot) effect of group
ggsurvplot(survfit(Surv(Account.length, Churn) ~ International.plan, data = dat),
pval = TRUE, # displays p-value of log-rank test of the difference between the two curves
conf.int = TRUE,
risk.table = TRUE, # Add risk table
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF"),
title = "KM Curve for telecom churn Survival by International.plan"
)
ggsurvplot(survfit(Surv(Account.length, Churn) ~ Voice.mail.plan, data = dat),
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF"),
title = "KM Curve for telecom churn Survival by Voice.mail.plan"
)
#Cumulative risk curve/event
ggsurvplot(fit1,
fun = "cumhaz", #"event"
conf.int = TRUE, # confidence Interval
palette = "lancet",
ggtheme = theme_bw()
)
ggsurvplot(fit2,
fun = "cumhaz", #"event"
conf.int = TRUE, # confidence Interval
palette = "lancet",
ggtheme = theme_bw()
)
# differences between groups with small p values-significant
surv_diff1 <- survdiff(Surv(Account.length, Churn) ~ International.plan, data = dat)
surv_diff1
## Call:
## survdiff(formula = Surv(Account.length, Churn) ~ International.plan,
## data = dat)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## International.plan=No 2396 270 346.7 17 160
## International.plan=Yes 270 118 41.3 143 160
##
## Chisq= 160 on 1 degrees of freedom, p= <2e-16
surv_diff2 <- survdiff(Surv(Account.length, Churn) ~ Voice.mail.plan, data = dat)
surv_diff2
## Call:
## survdiff(formula = Surv(Account.length, Churn) ~ Voice.mail.plan,
## data = dat)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## Voice.mail.plan=No 1933 323 283 5.62 20.9
## Voice.mail.plan=Yes 733 65 105 15.17 20.9
##
## Chisq= 20.9 on 1 degrees of freedom, p= 5e-06
Low p-values (<0.05 95%CI) indicate that a variable is
statistically significant in predicting the event. In the model,
StateCT,MA,MT,NJ,SC,TX;International.planYes; Voice.mail.planYes;
Number.vmail.messages;otal.intl.calls; Customer.service.calls are
significant features.
The Log-Likelihood Ratio Test tests the overall significance of
the model. Here the p-value (=<2e-16) indicates that the model fits
the data set well.
Coef provides log hazard ratios for each variable. Exp(coef)
represents the estimated hazard ratio. If exp(coef) is greater than 1,
it indicates an increased risk of the event (Churn) happening for the
given variable. If it’s less than 1, it indicates a decreased
risk.
cox_model <- coxph(formula = Surv(Account.length, Churn) ~ ., data = dat)
#print(cox_model)
summary(cox_model)
## Call:
## coxph(formula = Surv(Account.length, Churn) ~ ., data = dat)
##
## n= 2666, number of events= 388
##
## coef exp(coef) se(coef) z Pr(>|z|)
## StateAL 2.376e-01 1.268e+00 6.976e-01 0.341 0.733444
## StateAR 7.992e-01 2.224e+00 6.617e-01 1.208 0.227152
## StateAZ 5.357e-01 1.709e+00 8.238e-01 0.650 0.515557
## StateCA 1.161e+00 3.192e+00 7.407e-01 1.567 0.117141
## StateCO 2.297e-01 1.258e+00 6.967e-01 0.330 0.741673
## StateCT 1.378e+00 3.969e+00 6.565e-01 2.100 0.035759 *
## StateDC 6.936e-01 2.001e+00 7.384e-01 0.939 0.347546
## StateDE 8.409e-01 2.318e+00 6.895e-01 1.220 0.222645
## StateFL 6.429e-01 1.902e+00 6.966e-01 0.923 0.356055
## StateGA 9.414e-01 2.564e+00 6.863e-01 1.372 0.170144
## StateHI -4.585e-01 6.323e-01 9.192e-01 -0.499 0.617961
## StateIA 7.198e-01 2.054e+00 8.249e-01 0.873 0.382887
## StateID 6.009e-01 1.824e+00 7.382e-01 0.814 0.415616
## StateIL 1.726e-01 1.188e+00 7.697e-01 0.224 0.822580
## StateIN 7.108e-01 2.036e+00 7.147e-01 0.995 0.319911
## StateKS 9.227e-01 2.516e+00 6.654e-01 1.387 0.165504
## StateKY 1.263e+00 3.535e+00 7.176e-01 1.760 0.078447 .
## StateLA 5.170e-01 1.677e+00 8.249e-01 0.627 0.530830
## StateMA 1.463e+00 4.319e+00 6.852e-01 2.135 0.032740 *
## StateMD 9.959e-01 2.707e+00 6.453e-01 1.543 0.122729
## StateME 1.286e+00 3.619e+00 6.657e-01 1.932 0.053360 .
## StateMI 1.090e+00 2.974e+00 6.491e-01 1.679 0.093185 .
## StateMN 5.193e-01 1.681e+00 6.636e-01 0.783 0.433872
## StateMO 3.423e-01 1.408e+00 7.410e-01 0.462 0.644147
## StateMS 1.216e+00 3.372e+00 6.603e-01 1.841 0.065631 .
## StateMT 1.503e+00 4.494e+00 6.676e-01 2.251 0.024397 *
## StateNC 6.060e-02 1.062e+00 6.833e-01 0.089 0.929327
## StateND 2.079e-01 1.231e+00 7.721e-01 0.269 0.787720
## StateNE 4.334e-01 1.543e+00 7.771e-01 0.558 0.576995
## StateNH 1.146e+00 3.145e+00 6.751e-01 1.697 0.089699 .
## StateNJ 1.542e+00 4.675e+00 6.472e-01 2.383 0.017183 *
## StateNM 1.960e-01 1.216e+00 7.701e-01 0.254 0.799131
## StateNV 1.101e+00 3.008e+00 6.469e-01 1.702 0.088696 .
## StateNY 9.059e-01 2.474e+00 6.529e-01 1.387 0.165290
## StateOH 1.029e+00 2.798e+00 6.683e-01 1.540 0.123652
## StateOK 3.864e-01 1.472e+00 7.014e-01 0.551 0.581702
## StateOR 5.001e-01 1.649e+00 6.995e-01 0.715 0.474671
## StatePA 1.270e+00 3.562e+00 6.809e-01 1.866 0.062093 .
## StateRI -2.472e-01 7.810e-01 8.223e-01 -0.301 0.763708
## StateSC 1.363e+00 3.909e+00 6.670e-01 2.044 0.040990 *
## StateSD 4.641e-01 1.591e+00 7.130e-01 0.651 0.515086
## StateTN 1.046e+00 2.847e+00 7.368e-01 1.420 0.155650
## StateTX 1.487e+00 4.425e+00 6.366e-01 2.336 0.019473 *
## StateUT 8.259e-01 2.284e+00 6.852e-01 1.205 0.228095
## StateVA -4.849e-01 6.158e-01 7.708e-01 -0.629 0.529285
## StateVT 2.434e-01 1.276e+00 7.179e-01 0.339 0.734522
## StateWA 1.113e+00 3.042e+00 6.656e-01 1.672 0.094600 .
## StateWI 2.961e-01 1.345e+00 7.701e-01 0.384 0.700630
## StateWV 5.580e-01 1.747e+00 6.968e-01 0.801 0.423244
## StateWY 3.019e-01 1.352e+00 6.853e-01 0.441 0.659570
## Area.code 1.244e-03 1.001e+00 1.256e-03 0.991 0.321847
## International.planYes 1.358e+00 3.888e+00 1.210e-01 11.226 < 2e-16 ***
## Voice.mail.planYes -2.346e+00 9.573e-02 5.941e-01 -3.949 7.83e-05 ***
## Number.vmail.messages 4.924e-02 1.050e+00 1.830e-02 2.690 0.007142 **
## Total.day.minutes -1.235e+00 2.907e-01 3.095e+00 -0.399 0.689805
## Total.day.calls 5.626e-04 1.001e+00 2.647e-03 0.213 0.831647
## Total.day.charge 7.321e+00 1.511e+03 1.821e+01 0.402 0.687625
## Total.eve.minutes 5.589e-01 1.749e+00 1.591e+00 0.351 0.725321
## Total.eve.calls 1.203e-03 1.001e+00 2.665e-03 0.452 0.651553
## Total.eve.charge -6.537e+00 1.449e-03 1.872e+01 -0.349 0.726888
## Total.night.minutes -3.034e-01 7.383e-01 8.676e-01 -0.350 0.726597
## Total.night.calls 2.063e-03 1.002e+00 2.777e-03 0.743 0.457643
## Total.night.charge 6.782e+00 8.818e+02 1.928e+01 0.352 0.724997
## Total.intl.minutes -5.445e+00 4.318e-03 5.080e+00 -1.072 0.283767
## Total.intl.calls -9.096e-02 9.131e-01 2.568e-02 -3.542 0.000397 ***
## Total.intl.charge 2.036e+01 6.929e+08 1.881e+01 1.082 0.279193
## Customer.service.calls 3.314e-01 1.393e+00 3.309e-02 10.013 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## StateAL 1.268e+00 7.885e-01 3.231e-01 4.977e+00
## StateAR 2.224e+00 4.497e-01 6.079e-01 8.134e+00
## StateAZ 1.709e+00 5.853e-01 3.399e-01 8.588e+00
## StateCA 3.192e+00 3.133e-01 7.474e-01 1.363e+01
## StateCO 1.258e+00 7.948e-01 3.211e-01 4.930e+00
## StateCT 3.969e+00 2.520e-01 1.096e+00 1.437e+01
## StateDC 2.001e+00 4.998e-01 4.707e-01 8.507e+00
## StateDE 2.318e+00 4.313e-01 6.002e-01 8.956e+00
## StateFL 1.902e+00 5.257e-01 4.856e-01 7.451e+00
## StateGA 2.564e+00 3.901e-01 6.679e-01 9.840e+00
## StateHI 6.323e-01 1.582e+00 1.043e-01 3.831e+00
## StateIA 2.054e+00 4.868e-01 4.078e-01 1.035e+01
## StateID 1.824e+00 5.483e-01 4.292e-01 7.750e+00
## StateIL 1.188e+00 8.415e-01 2.629e-01 5.372e+00
## StateIN 2.036e+00 4.912e-01 5.016e-01 8.261e+00
## StateKS 2.516e+00 3.974e-01 6.829e-01 9.271e+00
## StateKY 3.535e+00 2.829e-01 8.662e-01 1.443e+01
## StateLA 1.677e+00 5.963e-01 3.330e-01 8.446e+00
## StateMA 4.319e+00 2.315e-01 1.128e+00 1.655e+01
## StateMD 2.707e+00 3.694e-01 7.643e-01 9.590e+00
## StateME 3.619e+00 2.763e-01 9.816e-01 1.334e+01
## StateMI 2.974e+00 3.363e-01 8.332e-01 1.061e+01
## StateMN 1.681e+00 5.949e-01 4.578e-01 6.171e+00
## StateMO 1.408e+00 7.102e-01 3.296e-01 6.016e+00
## StateMS 3.372e+00 2.966e-01 9.244e-01 1.230e+01
## StateMT 4.494e+00 2.225e-01 1.214e+00 1.663e+01
## StateNC 1.062e+00 9.412e-01 2.784e-01 4.054e+00
## StateND 1.231e+00 8.123e-01 2.711e-01 5.591e+00
## StateNE 1.543e+00 6.483e-01 3.364e-01 7.074e+00
## StateNH 3.145e+00 3.180e-01 8.373e-01 1.181e+01
## StateNJ 4.675e+00 2.139e-01 1.315e+00 1.662e+01
## StateNM 1.216e+00 8.220e-01 2.689e-01 5.503e+00
## StateNV 3.008e+00 3.325e-01 8.465e-01 1.069e+01
## StateNY 2.474e+00 4.042e-01 6.881e-01 8.896e+00
## StateOH 2.798e+00 3.574e-01 7.551e-01 1.037e+01
## StateOK 1.472e+00 6.795e-01 3.722e-01 5.819e+00
## StateOR 1.649e+00 6.065e-01 4.186e-01 6.495e+00
## StatePA 3.562e+00 2.807e-01 9.378e-01 1.353e+01
## StateRI 7.810e-01 1.280e+00 1.558e-01 3.914e+00
## StateSC 3.909e+00 2.558e-01 1.057e+00 1.445e+01
## StateSD 1.591e+00 6.287e-01 3.932e-01 6.434e+00
## StateTN 2.847e+00 3.513e-01 6.717e-01 1.206e+01
## StateTX 4.425e+00 2.260e-01 1.271e+00 1.541e+01
## StateUT 2.284e+00 4.378e-01 5.962e-01 8.749e+00
## StateVA 6.158e-01 1.624e+00 1.359e-01 2.789e+00
## StateVT 1.276e+00 7.839e-01 3.124e-01 5.209e+00
## StateWA 3.042e+00 3.287e-01 8.254e-01 1.121e+01
## StateWI 1.345e+00 7.437e-01 2.972e-01 6.083e+00
## StateWV 1.747e+00 5.724e-01 4.459e-01 6.845e+00
## StateWY 1.352e+00 7.394e-01 3.530e-01 5.181e+00
## Area.code 1.001e+00 9.988e-01 9.988e-01 1.004e+00
## International.planYes 3.888e+00 2.572e-01 3.067e+00 4.929e+00
## Voice.mail.planYes 9.573e-02 1.045e+01 2.988e-02 3.067e-01
## Number.vmail.messages 1.050e+00 9.520e-01 1.013e+00 1.089e+00
## Total.day.minutes 2.907e-01 3.440e+00 6.744e-04 1.253e+02
## Total.day.calls 1.001e+00 9.994e-01 9.954e-01 1.006e+00
## Total.day.charge 1.511e+03 6.618e-04 4.806e-13 4.750e+18
## Total.eve.minutes 1.749e+00 5.718e-01 7.739e-02 3.952e+01
## Total.eve.calls 1.001e+00 9.988e-01 9.960e-01 1.006e+00
## Total.eve.charge 1.449e-03 6.900e+02 1.701e-19 1.235e+13
## Total.night.minutes 7.383e-01 1.354e+00 1.348e-01 4.043e+00
## Total.night.calls 1.002e+00 9.979e-01 9.966e-01 1.008e+00
## Total.night.charge 8.818e+02 1.134e-03 3.431e-14 2.267e+19
## Total.intl.minutes 4.318e-03 2.316e+02 2.048e-07 9.103e+01
## Total.intl.calls 9.131e-01 1.095e+00 8.682e-01 9.602e-01
## Total.intl.charge 6.929e+08 1.443e-09 6.736e-08 7.128e+24
## Customer.service.calls 1.393e+00 7.179e-01 1.305e+00 1.486e+00
##
## Concordance= 0.784 (se = 0.013 )
## Likelihood ratio test= 437.9 on 67 df, p=<2e-16
## Wald test = 428.1 on 67 df, p=<2e-16
## Score (logrank) test = 481.7 on 67 df, p=<2e-16
cox1 = coxph(Surv(Account.length, Churn) ~ International.plan, data = dat)
print(cox1)
## Call:
## coxph(formula = Surv(Account.length, Churn) ~ International.plan,
## data = dat)
##
## coef exp(coef) se(coef) z p
## International.planYes 1.3072 3.6959 0.1105 11.82 <2e-16
##
## Likelihood ratio test=113.6 on 1 df, p=< 2.2e-16
## n= 2666, number of events= 388
cox2 = coxph(Surv(Account.length, Churn) ~ Voice.mail.plan, data = dat)
print(cox2)
## Call:
## coxph(formula = Surv(Account.length, Churn) ~ Voice.mail.plan,
## data = dat)
##
## coef exp(coef) se(coef) z p
## Voice.mail.planYes -0.6129 0.5418 0.1361 -4.504 6.67e-06
##
## Likelihood ratio test=23.09 on 1 df, p=1.549e-06
## n= 2666, number of events= 388
# Set up Surv() object
survdat <- Surv(time = dat$Account.length, event = dat$Churn)
# Exponential model
fit_exp <- survreg(survdat ~ 1, dist = "exponential")
lambda <- 1 / exp(fit_exp$coef)
summary(fit_exp)
##
## Call:
## survreg(formula = survdat ~ 1, dist = "exponential")
## Value Std. Error z p
## (Intercept) 6.5387 0.0508 129 <2e-16
##
## Scale fixed at 1
##
## Exponential distribution
## Loglik(model)= -2925 Loglik(intercept only)= -2925
## Number of Newton-Raphson Iterations: 5
## n= 2666
#Weibull model
fit_weibull <- survreg(survdat ~ 1, dist = "weibull")
alpha <- 1 / fit_weibull$scale
beta <- exp(fit_weibull$coef)
summary(fit_weibull)
##
## Call:
## survreg(formula = survdat ~ 1, dist = "weibull")
## Value Std. Error z p
## (Intercept) 5.4204 0.0284 190.8 <2e-16
## Log(scale) -1.0240 0.0400 -25.6 <2e-16
##
## Scale= 0.359
##
## Weibull distribution
## Loglik(model)= -2719.6 Loglik(intercept only)= -2719.6
## Number of Newton-Raphson Iterations: 10
## n= 2666
#Log-Normal
fit_lognormal <- survreg(survdat ~ 1, dist = "lognormal")
mu_lognormal <- fit_lognormal$coefficients
sigma_lognormal <- fit_lognormal$scale
summary(fit_lognormal)
##
## Call:
## survreg(formula = survdat ~ 1, dist = "lognormal")
## Value Std. Error z p
## (Intercept) 5.5446 0.0408 135.76 < 2e-16
## Log(scale) -0.2569 0.0372 -6.91 4.8e-12
##
## Scale= 0.773
##
## Log Normal distribution
## Loglik(model)= -2771.2 Loglik(intercept only)= -2771.2
## Number of Newton-Raphson Iterations: 5
## n= 2666
# Gaussian
fit_gaussian <- survreg(survdat ~ 1, dist = "gaussian")
mean_gaussian <- fit_gaussian$coefficients
scale_gaussian <- fit_gaussian$scale
summary(fit_gaussian)
##
## Call:
## survreg(formula = survdat ~ 1, dist = "gaussian")
## Value Std. Error z p
## (Intercept) 186.6159 3.2769 57 <2e-16
## Log(scale) 4.1546 0.0361 115 <2e-16
##
## Scale= 63.7
##
## Gaussian distribution
## Loglik(model)= -2716 Loglik(intercept only)= -2716
## Number of Newton-Raphson Iterations: 5
## n= 2666
#plot
tvec <- seq(0, 200, length = 201)
plot(tvec, dexp(tvec, lambda), type = "l", ylim = c(0, 0.004), xlab = "Time", ylab = "Density", col = "black", main = "Survival Model Comparison")
curve(dlnorm(x, meanlog = mu_lognormal, sdlog = sigma_lognormal), add = TRUE, col = "red", lty = 2)
curve(dweibull(x, alpha, beta), add = TRUE, col = "blue", lty = 3)
curve(dnorm(x, mean = mean_gaussian, sd = scale_gaussian), add = TRUE, col = "green", lty = 4)
# Add a legend
legend("topright", legend = c("Exponential", "Log-Normal", "Weibull", "Gaussian"), col = c("black", "red", "blue", "green"), lty = c(1, 2, 3, 4))
#AIC,BIC to find "best" fit
AIC_exp=AIC(fit_exp);BIC_exp=BIC(fit_exp)
AIC_weibull=AIC(fit_weibull);BIC_weibull=BIC(fit_weibull)
AIC_lognormal=AIC(fit_lognormal);BIC_lognormal=BIC(fit_lognormal)
AIC_gaussian=AIC(fit_gaussian);BIC_gaussian=BIC(fit_gaussian)
# Create a table
model_names <- c("Exponential", "Weibull", "Log-Normal", "Gaussian")
AIC_values <- c(AIC_exp, AIC_weibull, AIC_lognormal, AIC_gaussian)
BIC_values <- c(BIC_exp, BIC_weibull, BIC_lognormal, BIC_gaussian)
model_table <- data.frame(Model = model_names, AIC = AIC_values, BIC = BIC_values)
print(model_table)
## Model AIC BIC
## 1 Exponential 5852.019 5857.907
## 2 Weibull 5443.105 5454.882
## 3 Log-Normal 5546.433 5558.209
## 4 Gaussian 5435.924 5447.701