#1) Select a dataset of your choice ___ Unemployment dataset
#load the survival package (provides Surv() and survfit() used below)
library(survival)
#read survival_unemployment.csv from the working directory into the data frame data1
data1 <- read.csv(file = "survival_unemployment.csv")
#print a compact overview of data1: dimensions, column names, types and first values
str(object = data1)
## 'data.frame': 3343 obs. of 43 variables:
## $ spell : int 5 13 21 3 9 11 1 3 7 5 ...
## $ censor1 : int 1 1 1 1 0 0 0 1 1 0 ...
## $ censor2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ censor3 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ censor4 : int 0 0 0 0 0 1 0 0 0 1 ...
## $ ui : int 0 1 1 1 1 1 0 0 1 1 ...
## $ reprate : num 0.179 0.52 0.204 0.448 0.32 0.187 0.52 0.373 0.52 0.52 ...
## $ logwage : num 6.9 5.29 6.77 5.98 6.32 ...
## $ tenure : int 3 6 1 3 0 9 1 0 2 1 ...
## $ disrate : num 0.045 0.13 0.051 0.112 0.08 0.047 0.13 0.093 0.13 0.13 ...
## $ slack : int 1 1 1 1 0 0 1 1 1 1 ...
## $ abolpos : int 0 0 0 0 1 0 0 0 0 0 ...
## $ explose : int 0 0 0 0 1 1 0 0 1 1 ...
## $ stateur : num 6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 ...
## $ houshead: int 1 1 0 1 1 1 1 1 1 0 ...
## $ married : int 1 1 0 1 0 1 1 1 1 1 ...
## $ female : int 0 0 0 0 0 0 0 0 0 1 ...
## $ child : int 1 1 0 1 0 1 1 1 1 0 ...
## $ ychild : int 1 1 0 1 0 1 1 1 0 0 ...
## $ nonwhite: int 0 0 0 1 0 0 1 0 0 0 ...
## $ age : int 41 30 36 26 22 43 24 32 35 31 ...
## $ schlt12 : int 0 1 0 1 0 0 0 0 0 0 ...
## $ schgt12 : int 1 0 0 0 1 0 1 0 1 0 ...
## $ smsa : int 1 1 1 1 1 0 0 1 0 1 ...
## $ bluecoll: int 0 1 1 1 1 1 0 1 1 0 ...
## $ mining : int 1 1 1 0 1 0 0 0 0 0 ...
## $ constr : int 0 0 0 1 0 0 0 1 0 0 ...
## $ transp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trade : int 0 0 0 0 0 1 0 0 0 0 ...
## $ fire : int 0 0 0 0 0 0 0 0 0 1 ...
## $ services: int 0 0 0 0 0 0 1 0 0 0 ...
## $ pubadmin: int 0 0 0 0 0 0 0 0 0 0 ...
## $ year85 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ year87 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ year89 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ midatl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ encen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ wncen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ southatl: int 0 0 0 0 0 0 0 0 0 0 ...
## $ escen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ wscen : int 1 1 1 1 1 1 1 1 1 1 ...
## $ mountain: int 0 0 0 0 0 0 0 0 0 0 ...
## $ pacific : int 0 0 0 0 0 0 0 0 0 0 ...
#2) Clean, augment, and preprocess the data into a more convenient form, if needed.
#Format variables as necessary ____
#check the whole data frame for missing values (output below: none found)
anyNA(data1)
## [1] FALSE
#pull the spell column out as a plain vector; used as the follow-up time in Surv()
time <- data1[["spell"]]
#pull the censor1 column out as a plain vector; used as the event flag in Surv()
#NOTE(review): the meaning of censor1 == 1 is not documented in this file - confirm it marks the event of interest
event <- data1[["censor1"]]
#3) Perform a preliminary inspection of each dataset
#quartiles, mean and range of the spell durations
summary(object = time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 5.000 6.248 9.000 28.000
#same summary for the 0/1 event flag; its mean is the share of spells with event == 1
summary(object = event)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.321 1.000 1.000
#4) Describe a problem statement as it relates to using survival
#(time-to-event) analysis approach for prediction ____
##How many people found a job in the first period, and how does the probability of still being unemployed change over time?
#5) Compute survival curves and plot survival curve (Kaplan-Meier) plots ____
#fit one overall survival curve (no grouping variable, hence ~ 1) to the time/event vectors
#the formula is kept inline so the "Call:" line printed by summary() stays the same
kmsurvival <- survfit(formula = Surv(time, event) ~ 1)
#6) Display a summary of the survival curves. ____
#one row per event time: number at risk, number of events, survival estimate, std. error and 95% CI
summary(object = kmsurvival)
## Call: survfit(formula = Surv(time, event) ~ 1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 3343 294 0.912 0.00490 0.903 0.922
## 2 2803 178 0.854 0.00622 0.842 0.866
## 3 2321 119 0.810 0.00708 0.797 0.824
## 4 1897 56 0.786 0.00756 0.772 0.801
## 5 1676 104 0.738 0.00847 0.721 0.754
## 6 1339 32 0.720 0.00882 0.703 0.737
## 7 1196 85 0.669 0.00979 0.650 0.688
## 8 933 15 0.658 0.01001 0.639 0.678
## 9 848 33 0.632 0.01057 0.612 0.654
## 10 717 3 0.630 0.01064 0.609 0.651
## 11 659 26 0.605 0.01128 0.583 0.627
## 12 556 7 0.597 0.01150 0.575 0.620
## 13 509 25 0.568 0.01234 0.544 0.593
## 14 415 30 0.527 0.01353 0.501 0.554
## 15 311 19 0.495 0.01458 0.467 0.524
## 16 252 10 0.475 0.01527 0.446 0.506
## 17 201 8 0.456 0.01606 0.426 0.489
## 18 169 7 0.437 0.01691 0.405 0.472
## 19 149 4 0.426 0.01744 0.393 0.461
## 20 130 3 0.416 0.01794 0.382 0.452
## 21 109 4 0.400 0.01883 0.365 0.439
## 22 82 4 0.381 0.02029 0.343 0.423
## 26 48 2 0.365 0.02233 0.324 0.412
## 27 33 5 0.310 0.02964 0.257 0.374
#5, continued) Plot the Kaplan-Meier survival curve ____
plot(
  kmsurvival,
  xlab = "Time",
  ylab = "Survival Probability"
)

#7) Provide a summary of your results and explain the outcomes ____
#In row 1 we can see that 3343 people are at risk at time 1, where there are 294 events,
#giving a survival rate of about 91%. The survival rate decreases as time increases.
#From this we can conclude that 294 people found a job in the 1st period.
#The plot shows survival decreasing over time: it starts at 1 and is approximately 0.31 by the 28th period.