insurance<-read.csv("C:\\Users\\user\\Desktop\\myR\\insurance.csv")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Step1
summary(insurance)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
#I would like to set up the standard to the top 25% of charges which is 16640$ to dichotomize the numeric response variable 'charges'. The response variable ‘charges’ is the individual medical costs billed by health insurance.
highcharge<-ifelse(insurance$charges<=16640,"No","Yes")
insurance<-data.frame(insurance,highcharge)
#Step2
ggplot(insurance, aes(smoker, fill=highcharge))+
geom_bar(position="fill")

modS<-glm(highcharge~smoker, insurance, family="binomial")
summary(modS)
##
## Call:
## glm(formula = highcharge ~ smoker, family = "binomial", data = insurance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3103 -0.3954 -0.3954 0.1855 2.2750
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.5096 0.1163 -21.59 <2e-16 ***
## smokeryes 5.1064 0.2647 19.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1505.91 on 1337 degrees of freedom
## Residual deviance: 705.93 on 1336 degrees of freedom
## AIC: 709.93
##
## Number of Fisher Scoring iterations: 5
library(rpart)
set.seed(1)
dim(insurance)
## [1] 1338 8
train<-sample(1:nrow(insurance), 892)
tree.insurance<-rpart(highcharge~.-charges, data=insurance,
subset=train,
control=rpart.control(minsplit=1),
method="class")
summary(tree.insurance)
## Call:
## rpart(formula = highcharge ~ . - charges, data = insurance, subset = train,
## method = "class", control = rpart.control(minsplit = 1))
## n= 892
##
## CP nsplit rel error xerror xstd
## 1 0.71495327 0 1.0000000 1.0000000 0.05959716
## 2 0.01869159 1 0.2850467 0.2850467 0.03522648
## 3 0.01000000 3 0.2476636 0.2616822 0.03385330
##
## Variable importance
## smoker bmi age
## 92 5 2
##
## Node number 1: 892 observations, complexity param=0.7149533
## predicted class=No expected loss=0.2399103 P(node) =1
## class counts: 678 214
## probabilities: 0.760 0.240
## left son=2 (707 obs) right son=3 (185 obs)
## Primary splits:
## smoker splits as LR, improve=211.814400, (0 missing)
## age < 41.5 to the left, improve= 2.563925, (0 missing)
## region splits as RLRL, improve= 2.347650, (0 missing)
## sex splits as LR, improve= 2.205411, (0 missing)
## bmi < 22.5575 to the left, improve= 2.110439, (0 missing)
##
## Node number 2: 707 observations
## predicted class=No expected loss=0.06364922 P(node) =0.7926009
## class counts: 662 45
## probabilities: 0.936 0.064
##
## Node number 3: 185 observations, complexity param=0.01869159
## predicted class=Yes expected loss=0.08648649 P(node) =0.2073991
## class counts: 16 169
## probabilities: 0.086 0.914
## left son=6 (21 obs) right son=7 (164 obs)
## Primary splits:
## bmi < 22.51 to the left, improve=9.06112000, (0 missing)
## age < 29.5 to the left, improve=5.28365100, (0 missing)
## region splits as LLRL, improve=1.42915400, (0 missing)
## children < 0.5 to the left, improve=0.37048590, (0 missing)
## sex splits as LR, improve=0.07995528, (0 missing)
##
## Node number 6: 21 observations, complexity param=0.01869159
## predicted class=No expected loss=0.4761905 P(node) =0.0235426
## class counts: 11 10
## probabilities: 0.524 0.476
## left son=12 (10 obs) right son=13 (11 obs)
## Primary splits:
## age < 28 to the left, improve=5.40346300, (0 missing)
## bmi < 21.825 to the right, improve=1.58730200, (0 missing)
## region splits as LLRL, improve=1.21303300, (0 missing)
## children < 0.5 to the left, improve=0.64285710, (0 missing)
## sex splits as LR, improve=0.00952381, (0 missing)
## Surrogate splits:
## bmi < 20.3775 to the right, agree=0.667, adj=0.3, (0 split)
## sex splits as LR, agree=0.571, adj=0.1, (0 split)
## children < 0.5 to the left, agree=0.571, adj=0.1, (0 split)
## region splits as RLRR, agree=0.571, adj=0.1, (0 split)
##
## Node number 7: 164 observations
## predicted class=Yes expected loss=0.0304878 P(node) =0.1838565
## class counts: 5 159
## probabilities: 0.030 0.970
##
## Node number 12: 10 observations
## predicted class=No expected loss=0.1 P(node) =0.01121076
## class counts: 9 1
## probabilities: 0.900 0.100
##
## Node number 13: 11 observations
## predicted class=Yes expected loss=0.1818182 P(node) =0.01233184
## class counts: 2 9
## probabilities: 0.182 0.818
plot(tree.insurance , uniform=TRUE,margin=0.2,
main="Classification Tree for Insurance Charges")
text(tree.insurance , use.n=TRUE, all=TRUE, cex=.8)

tree.insurance$cptable
## CP nsplit rel error xerror xstd
## 1 0.71495327 0 1.0000000 1.0000000 0.05959716
## 2 0.01869159 1 0.2850467 0.2850467 0.03522648
## 3 0.01000000 3 0.2476636 0.2616822 0.03385330
#install.packages("rpart.plot")
library(rpart.plot)
prp(tree.insurance, faclen = 0, cex = 0.7, extra=1, space=.5)

prune.insurance<- prune(tree.insurance, cp=0.01) # from cptable
# plot the pruned tree
par(mfrow=c(1,1))
plot(prune.insurance, uniform=TRUE,margin=0.2,
main="Pruned Regression Tree for Nsplit=3")
text(prune.insurance, use.n=TRUE, all=TRUE, cex=.8)

highcharge.test<-insurance$highcharge[-train]
# what we predict
tree.pred2<-predict(prune.insurance, newdata =insurance , type="class")
test.pred<-tree.pred2[-train]
cm<-table(test.pred, highcharge.test)
cm
## highcharge.test
## test.pred No Yes
## No 323 35
## Yes 2 86
sum(diag(cm))/sum(cm)
## [1] 0.9170404