#1) Select a dataset of your choice ___ Unemployment dataset
#load the survival package (it must already be installed)
library(survival)
# Read the unemployment CSV from the working directory into `data1`
data1 <- read.csv(file = "survival_unemployment.csv")
# Compact overview of the data frame: dimensions, column names, types,
# and the first few values of each column
str(data1)
## 'data.frame':    3343 obs. of  43 variables:
##  $ spell   : int  5 13 21 3 9 11 1 3 7 5 ...
##  $ censor1 : int  1 1 1 1 0 0 0 1 1 0 ...
##  $ censor2 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ censor3 : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ censor4 : int  0 0 0 0 0 1 0 0 0 1 ...
##  $ ui      : int  0 1 1 1 1 1 0 0 1 1 ...
##  $ reprate : num  0.179 0.52 0.204 0.448 0.32 0.187 0.52 0.373 0.52 0.52 ...
##  $ logwage : num  6.9 5.29 6.77 5.98 6.32 ...
##  $ tenure  : int  3 6 1 3 0 9 1 0 2 1 ...
##  $ disrate : num  0.045 0.13 0.051 0.112 0.08 0.047 0.13 0.093 0.13 0.13 ...
##  $ slack   : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ abolpos : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ explose : int  0 0 0 0 1 1 0 0 1 1 ...
##  $ stateur : num  6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 6.6 ...
##  $ houshead: int  1 1 0 1 1 1 1 1 1 0 ...
##  $ married : int  1 1 0 1 0 1 1 1 1 1 ...
##  $ female  : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ child   : int  1 1 0 1 0 1 1 1 1 0 ...
##  $ ychild  : int  1 1 0 1 0 1 1 1 0 0 ...
##  $ nonwhite: int  0 0 0 1 0 0 1 0 0 0 ...
##  $ age     : int  41 30 36 26 22 43 24 32 35 31 ...
##  $ schlt12 : int  0 1 0 1 0 0 0 0 0 0 ...
##  $ schgt12 : int  1 0 0 0 1 0 1 0 1 0 ...
##  $ smsa    : int  1 1 1 1 1 0 0 1 0 1 ...
##  $ bluecoll: int  0 1 1 1 1 1 0 1 1 0 ...
##  $ mining  : int  1 1 1 0 1 0 0 0 0 0 ...
##  $ constr  : int  0 0 0 1 0 0 0 1 0 0 ...
##  $ transp  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trade   : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ fire    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ services: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ pubadmin: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ year85  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ year87  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ year89  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ midatl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ encen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ wncen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ southatl: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ escen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ wscen   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ mountain: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pacific : int  0 0 0 0 0 0 0 0 0 0 ...
#2) Clean, augment, and preprocess the data into a more convenient form, if needed.  
#Format variables as necessary ____ 
# Check for missing values anywhere in the data frame (none are present)
anyNA(data1)
## [1] FALSE
# `spell` is used as the time variable in the survival model below
time <- data1[["spell"]]
# `censor1` (0/1) is used as the event indicator in the survival model below
# NOTE(review): presumably 1 = the spell ended in a job -- confirm against
# the data dictionary
event <- data1[["censor1"]]
#3) Perform a preliminary inspection of each dataset 
# Distribution of the time variable: min, quartiles, mean and max
summary(time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   5.000   6.248   9.000  28.000
# `event` only takes the values 0 and 1, so the mean reported here is the
# share of rows with event == 1 (about 32%)
summary(event)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.321   1.000   1.000
#4) Describe a problem statement as it relates to using survival 
#(time-to-event) analysis approach for prediction ____ 
#Problem statement: how many people found a job in the first period, and how
#does the probability of still being unemployed change as time goes on?
#5) Compute survival curves and plot survival curve (Kaplan-Meier) plots  ____ 
# Fit a single, unstratified (`~ 1`) Kaplan-Meier curve: `time` holds the
# durations and `event` is the status indicator passed to Surv()
kmsurvival <- survfit(formula = Surv(time, event) ~ 1)
#6) Display a summary of the survival curves. ____ 
# One row per distinct event time: number at risk, number of events,
# estimated survival probability, its standard error and 95% CI
summary(kmsurvival)
## Call: survfit(formula = Surv(time, event) ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   3343     294    0.912 0.00490        0.903        0.922
##     2   2803     178    0.854 0.00622        0.842        0.866
##     3   2321     119    0.810 0.00708        0.797        0.824
##     4   1897      56    0.786 0.00756        0.772        0.801
##     5   1676     104    0.738 0.00847        0.721        0.754
##     6   1339      32    0.720 0.00882        0.703        0.737
##     7   1196      85    0.669 0.00979        0.650        0.688
##     8    933      15    0.658 0.01001        0.639        0.678
##     9    848      33    0.632 0.01057        0.612        0.654
##    10    717       3    0.630 0.01064        0.609        0.651
##    11    659      26    0.605 0.01128        0.583        0.627
##    12    556       7    0.597 0.01150        0.575        0.620
##    13    509      25    0.568 0.01234        0.544        0.593
##    14    415      30    0.527 0.01353        0.501        0.554
##    15    311      19    0.495 0.01458        0.467        0.524
##    16    252      10    0.475 0.01527        0.446        0.506
##    17    201       8    0.456 0.01606        0.426        0.489
##    18    169       7    0.437 0.01691        0.405        0.472
##    19    149       4    0.426 0.01744        0.393        0.461
##    20    130       3    0.416 0.01794        0.382        0.452
##    21    109       4    0.400 0.01883        0.365        0.439
##    22     82       4    0.381 0.02029        0.343        0.423
##    26     48       2    0.365 0.02233        0.324        0.412
##    27     33       5    0.310 0.02964        0.257        0.374
#5, continued) Plot the Kaplan-Meier survival curve ____ 
# Draw the fitted survival curve stored in `kmsurvival`
plot(
  kmsurvival,
  xlab = "Time",
  ylab = "Survival Probability"
)

#7) Provide a summary of your results and explain the outcomes ____ 
#In row 1 of the summary, all 3343 people are at risk at time 1 and 294 events occur,
#giving a survival probability of about 91%. The survival probability decreases as time increases.
#From this we can conclude that 294 people found a job in the 1st period.
#The plot shows the curve decreasing over time: it starts at 1 and falls to approximately 0.31 by the end of follow-up (time 28).