# Library
library(dplyr)
library(ggplot2)
library(caret)
library(ISLR)
library(MASS)
library(glmnet)
library(leaps)
library(corrplot)
# Get Data
df <- read.csv('~/Desktop/data.csv')
attach(df)summary(df)## educate relaffca refaffev relaffsp
## Min. :0.000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :2.000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :2.207 Mean :0.3778 Mean :0.01482 Mean :0.1148
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :3.000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## NA's :6 NA's :6 NA's :6
## relaffno relatend homebrth brthtype
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :1.000 Median :0.000 Median :1.0000
## Mean :0.8481 Mean :1.681 Mean :0.192 Mean :0.6473
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :4.000 Max. :1.000 Max. :1.0000
## NA's :6 NA's :1
## partdad partdoul partphys partmidt
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00000
## Median :1.000 Median :1.000 Median :1.000 Median :0.00000
## Mean :0.971 Mean :0.558 Mean :0.692 Mean :0.07971
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.00000
##
## partnurs parttech satisfy placedes
## Min. :0.0000 Min. :0.0000 Min. :0.00 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.75 1st Qu.:1.0000
## Median :1.0000 Median :0.0000 Median :1.00 Median :1.0000
## Mean :0.6341 Mean :0.2899 Mean :0.75 Mean :0.8139
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.00 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00 Max. :1.0000
## NA's :2
## pregprob brthprob postprob age
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :18.33
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:27.91
## Median :1.0000 Median :0.0000 Median :1.0000 Median :31.85
## Mean :0.6971 Mean :0.4485 Mean :0.8059 Mean :31.37
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:34.80
## Max. :3.0000 Max. :3.0000 Max. :3.0000 Max. :44.61
## NA's :2 NA's :4 NA's :3 NA's :13
## esicospo esiepdpo esiewbpo esiparpo
## Min. :0.000 Min. :0.000 Min. :0.330 Min. :0.000
## 1st Qu.:2.830 1st Qu.:2.000 1st Qu.:2.670 1st Qu.:1.170
## Median :3.500 Median :3.000 Median :3.170 Median :2.000
## Mean :3.211 Mean :2.658 Mean :3.047 Mean :1.996
## 3rd Qu.:3.830 3rd Qu.:3.670 3rd Qu.:3.670 3rd Qu.:2.830
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## NA's :2 NA's :2 NA's :2 NA's :2
## esirelpo
## Min. :0.000
## 1st Qu.:2.670
## Median :3.330
## Mean :3.006
## 3rd Qu.:3.670
## Max. :4.000
## NA's :2
na_count <-sapply(df, function(df) sum(length(which(is.na(df)))))
na_count <- data.frame(na_count)
na_count## na_count
## educate 0
## relaffca 6
## refaffev 6
## relaffsp 6
## relaffno 6
## relatend 0
## homebrth 0
## brthtype 1
## partdad 0
## partdoul 0
## partphys 0
## partmidt 0
## partnurs 0
## parttech 0
## satisfy 0
## placedes 2
## pregprob 2
## brthprob 4
## postprob 3
## age 13
## esicospo 2
## esiepdpo 2
## esiewbpo 2
## esiparpo 2
## esirelpo 2
## Total number of missing values
sum(na_count)## [1] 59
# Missing values in percent
na_count_percent <- na_count/59
na_count_percent## na_count
## educate 0.00000000
## relaffca 0.10169492
## refaffev 0.10169492
## relaffsp 0.10169492
## relaffno 0.10169492
## relatend 0.00000000
## homebrth 0.00000000
## brthtype 0.01694915
## partdad 0.00000000
## partdoul 0.00000000
## partphys 0.00000000
## partmidt 0.00000000
## partnurs 0.00000000
## parttech 0.00000000
## satisfy 0.00000000
## placedes 0.03389831
## pregprob 0.03389831
## brthprob 0.06779661
## postprob 0.05084746
## age 0.22033898
## esicospo 0.03389831
## esiepdpo 0.03389831
## esiewbpo 0.03389831
## esiparpo 0.03389831
## esirelpo 0.03389831
df <- na.omit(df)barplot(table(satisfy), xlab = "Satisfy", ylab = "# of observations")data1 = sort(sample(nrow(df), nrow(df)*.7))
#creating training data set by selecting the output row values
train<-df[data1,]
#creating test data set by not selecting the output row values
test<-df[-data1,]model <- lda(satisfy~., data = train)
model## Call:
## lda(satisfy ~ ., data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.2080925 0.7919075
##
## Group means:
## educate relaffca refaffev relaffsp relaffno relatend homebrth
## 0 2.277778 0.3333333 0.00000000 0.08333333 0.7777778 1.500000 0.0000000
## 1 2.197080 0.3795620 0.02919708 0.14598540 0.8759124 1.817518 0.2262774
## brthtype partdad partdoul partphys partmidt partnurs parttech
## 0 0.1944444 0.9444444 0.4722222 0.8888889 0.08333333 0.6944444 0.3333333
## 1 0.7737226 0.9708029 0.5620438 0.6423358 0.08029197 0.5912409 0.2773723
## placedes pregprob brthprob postprob age esicospo esiepdpo esiewbpo
## 0 0.6666667 0.8888889 0.9722222 1.111111 32.25750 3.102500 2.792778 2.787500
## 1 0.8613139 0.6058394 0.2408759 0.649635 31.06781 3.280511 2.564526 3.121898
## esiparpo esirelpo
## 0 1.985556 2.888611
## 1 2.059635 3.078248
##
## Coefficients of linear discriminants:
## LD1
## educate -0.033100143
## relaffca 0.570697010
## refaffev 0.857066033
## relaffsp 0.001775647
## relaffno -0.377556388
## relatend -0.103111521
## homebrth 0.103767702
## brthtype 2.041168692
## partdad -0.259600572
## partdoul -0.336914726
## partphys -0.437897749
## partmidt -0.330060799
## partnurs -0.091604428
## parttech 0.179613519
## placedes 0.601957955
## pregprob 0.134704827
## brthprob -0.794802922
## postprob -0.073685095
## age -0.019919565
## esicospo 0.528781309
## esiepdpo -0.415384158
## esiewbpo 0.202073909
## esiparpo 0.153260522
## esirelpo 0.233750394
###LDA determines group means and computes, for each individual, the probability of belonging to the different groups. The individual is then affected to the group with the highest probability score.
Group means: group center of gravity. Shows the mean of each variable in each group.
Coefficients of linear discriminants: Shows the linear combination of predictor variables that are used to form the LDA decision rule. 0.14(educate) + 0.51(relaffca) + 1.12(relaffev) - -0.17(relaffsp) - 0.28(relaffno) - 0.04(relatend) + 0.37(homebrth) + 1.78(brthtype) - 0.64(partdad) - 0.33(partdoul) - 0.34(partphys) - 0.13(partmidt)…..-0.14(esirelpo)
pred <- predict(model, test)
pred## $class
## [1] 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1
## [39] 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
##
## $posterior
## 0 1
## 5 0.7026867223 0.2973132777
## 8 0.0071725577 0.9928274423
## 9 0.0215232454 0.9784767546
## 12 0.0218069683 0.9781930317
## 14 0.0134294039 0.9865705961
## 22 0.9988014956 0.0011985044
## 25 0.0314607361 0.9685392639
## 30 0.0160910370 0.9839089630
## 32 0.0075748713 0.9924251287
## 42 0.0129606613 0.9870393387
## 43 0.0100500711 0.9899499289
## 46 0.0073698200 0.9926301800
## 53 0.0956365038 0.9043634962
## 57 0.0027138977 0.9972861023
## 58 0.2775071088 0.7224928912
## 61 0.0198485879 0.9801514121
## 62 0.0461454303 0.9538545697
## 66 0.2373162616 0.7626837384
## 68 0.0095486275 0.9904513725
## 71 0.0013559694 0.9986440306
## 73 0.0031973639 0.9968026361
## 79 0.0110911255 0.9889088745
## 82 0.0065326325 0.9934673675
## 101 0.0136085120 0.9863914880
## 103 0.0026031279 0.9973968721
## 110 0.0524923756 0.9475076244
## 113 0.0010568937 0.9989431063
## 114 0.0043179878 0.9956820122
## 117 0.0021755601 0.9978244399
## 120 0.2611627323 0.7388372677
## 121 0.9590638805 0.0409361195
## 128 0.6709957342 0.3290042658
## 136 0.9892747791 0.0107252209
## 137 0.0011238371 0.9988761629
## 139 0.2007262994 0.7992737006
## 140 0.5009752342 0.4990247658
## 141 0.2811689077 0.7188310923
## 143 0.1144704636 0.8855295364
## 144 0.7761289610 0.2238710390
## 147 0.9047635375 0.0952364625
## 150 0.0139342107 0.9860657893
## 161 0.8229290488 0.1770709512
## 162 0.8599834450 0.1400165550
## 166 0.0129955992 0.9870044008
## 167 0.0681224285 0.9318775715
## 169 0.9733856955 0.0266143045
## 170 0.9442182381 0.0557817619
## 172 0.0972112838 0.9027887162
## 180 0.7574402010 0.2425597990
## 181 0.9772233984 0.0227766016
## 185 0.9991755212 0.0008244788
## 188 0.9837343710 0.0162656290
## 192 0.2959939056 0.7040060944
## 194 0.1289348684 0.8710651316
## 196 0.8818787513 0.1181212487
## 202 0.4308356588 0.5691643412
## 214 0.0122360497 0.9877639503
## 217 0.1746543075 0.8253456925
## 219 0.9342203757 0.0657796243
## 224 0.4799341472 0.5200658528
## 230 0.0563669673 0.9436330327
## 236 0.0261869439 0.9738130561
## 237 0.0066097424 0.9933902576
## 238 0.0022893683 0.9977106317
## 241 0.0058544735 0.9941455265
## 242 0.0008815263 0.9991184737
## 247 0.0006235033 0.9993764967
## 248 0.0002679588 0.9997320412
## 249 0.0055559024 0.9944440976
## 250 0.0064607728 0.9935392272
## 251 0.1026181228 0.8973818772
## 257 0.0004311075 0.9995688925
## 269 0.0000482518 0.9999517482
## 270 0.0003197653 0.9996802347
## 271 0.0208958269 0.9791041731
##
## $x
## LD1
## 5 -1.62745302
## 8 0.89916819
## 9 0.41332798
## 12 0.40748705
## 14 0.62274081
## 22 -4.18677283
## 25 0.24323421
## 30 0.54266393
## 32 0.87517822
## 42 0.63845051
## 43 0.75071490
## 46 0.88724302
## 53 -0.27181532
## 57 1.32519691
## 58 -0.83462013
## 61 0.44941839
## 62 0.06942248
## 66 -0.74273077
## 68 0.77326900
## 71 1.62855711
## 73 1.25345056
## 79 0.70724718
## 82 0.94022680
## 101 0.61688051
## 103 1.34342879
## 110 0.01027754
## 113 1.73741733
## 114 1.12185425
## 117 1.42190817
## 120 -0.79837166
## 121 -2.62835023
## 128 -1.56312148
## 136 -3.22633031
## 137 1.71059010
## 139 -0.64921704
## 140 -1.25384393
## 141 -0.84255734
## 143 -0.35943683
## 144 -1.79462720
## 147 -2.23449023
## 150 0.60641618
## 161 -1.92250728
## 162 -2.04417470
## 166 0.63726039
## 167 -0.11070879
## 169 -2.82269253
## 170 -2.48652300
## 172 -0.27970228
## 180 -1.74900635
## 181 -2.89235501
## 185 -4.35016347
## 188 -3.04216109
## 192 -0.87407135
## 194 -0.41854397
## 196 -2.12934511
## 202 -1.13064475
## 214 0.66387463
## 217 -0.57450056
## 219 -2.40994088
## 224 -1.21710041
## 230 -0.02258491
## 236 0.32566402
## 237 0.93507256
## 238 1.39960926
## 241 0.98834970
## 242 1.81666203
## 247 1.96788132
## 248 2.33653907
## 249 1.01132131
## 250 0.94508481
## 251 -0.30594181
## 257 2.12897522
## 269 3.08470511
## 270 2.25939064
## 271 0.42651655
pred_class = pred$class
table(pred_class, test$satisfy)##
## pred_class 0 1
## 0 13 5
## 1 9 48
mean(pred_class==test$satisfy)## [1] 0.8133333
plot(model)####Correlation Plot: To visualoze the corrrelation among variables in the dataset.
M <- cor(df)
corrplot(M, method = "circle")Positive correlations are displayed in blue and negative correlations in red color. Color intensity and the size of the circle are proportional to the correlation coefficients.
From the above plot we can see that Satisfy is highly correlated to variables;relatend, brthtype, placedes, and esipdpo.
df$age = factor(df$age)
logmodel <- glm(satisfy ~ educate+relaffca+refaffev+ relaffsp+ relaffno+ relatend+ homebrth+
brthtype+ partdad+ partdoul+ partphys+ partmidt+ partnurs+ parttech+ satisfy +
placedes+ pregprob+ brthprob+ postprob+esicospo +esiepdpo +esiewbpo +esiparpo+esirelpo, data = df)
summary(logmodel)##
## Call:
## glm(formula = satisfy ~ educate + relaffca + refaffev + relaffsp +
## relaffno + relatend + homebrth + brthtype + partdad + partdoul +
## partphys + partmidt + partnurs + parttech + satisfy + placedes +
## pregprob + brthprob + postprob + esicospo + esiepdpo + esiewbpo +
## esiparpo + esirelpo, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.90367 -0.15971 0.05071 0.19897 0.70147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.144635 0.209588 0.690 0.490853
## educate -0.007587 0.039194 -0.194 0.846691
## relaffca 0.121880 0.052048 2.342 0.020074 *
## refaffev 0.146378 0.175283 0.835 0.404555
## relaffsp 0.057649 0.077611 0.743 0.458387
## relaffno -0.025556 0.078968 -0.324 0.746524
## relatend -0.002147 0.021051 -0.102 0.918839
## homebrth 0.074812 0.076104 0.983 0.326659
## brthtype 0.388775 0.053837 7.221 7.95e-12 ***
## partdad 0.049392 0.126288 0.391 0.696089
## partdoul -0.051712 0.050002 -1.034 0.302158
## partphys -0.057829 0.061732 -0.937 0.349880
## partmidt -0.017824 0.086377 -0.206 0.836708
## partnurs -0.076802 0.049559 -1.550 0.122621
## parttech 0.061682 0.053027 1.163 0.245979
## placedes 0.098954 0.060173 1.644 0.101477
## pregprob 0.035768 0.031828 1.124 0.262304
## brthprob -0.134539 0.035969 -3.740 0.000233 ***
## postprob -0.027609 0.029985 -0.921 0.358168
## esicospo 0.092799 0.057603 1.611 0.108590
## esiepdpo -0.090236 0.026590 -3.394 0.000816 ***
## esiewbpo 0.045991 0.030310 1.517 0.130593
## esiparpo 0.038407 0.026729 1.437 0.152133
## esirelpo 0.028552 0.053181 0.537 0.591885
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1105742)
##
## Null deviance: 44.435 on 247 degrees of freedom
## Residual deviance: 24.769 on 224 degrees of freedom
## AIC: 182.44
##
## Number of Fisher Scoring iterations: 2
Deviance Residuals: Are good as they are close to 0 and roughly symmetrical
coefficients: We are interested in the part of output that shows the coefficients, their standard errors, the z-statistic (sometimes called a Wald z-statistic), and the associated p-values. Relaffca, brthprob, brthtype, and esiepdpo are statistically significant, as their corresponding p-values are less that 0.05. The logistic regression coefficients give the change in the log odds of the outcome for a one unit increase in the predictor variable.
For every one unit change in relaffca, the log odds of Satisfy (versus not satisfied) increases by 0.12.
For every one unit change in brthtype, the log odds of Satisfy (versus not satisfied) increases by 0.38.
For every one unit change in brthprob, the log odds of Satisfy (versus not satisfied) decreases by 0.13.
For every one unit change in esiepdpo, the log odds of Satisfy (versus not satisfied) decreases by 0.09.
pred.data = data.frame(probability.of.sat=logmodel$fitted.values, sat=df$satisfy)
pred.data= pred.data[order(pred.data$probability.of.sat, decreasing = FALSE),]
pred.data$rank = 1:nrow(pred.data)
library(ggplot2)
library(cowplot)
ggplot(data=pred.data, aes(x=rank, y=probability.of.sat)) +
geom_point(aes(color='sat'), alpha=1, shape=4, stroke=2)+ xlab("index")+ylab("prob")