insurance<-read.csv("C:\\Users\\user\\Desktop\\myR\\insurance.csv")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#Step1
summary(insurance) 
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
#I would like to set up the standard to the top 25% of charges which is 16640$ to dichotomize the numeric response variable 'charges'. The response variable ‘charges’ is the individual medical costs billed by health insurance. 

highcharge<-ifelse(insurance$charges<=16640,"No","Yes")
insurance<-data.frame(insurance,highcharge)
#Step2
ggplot(insurance, aes(smoker, fill=highcharge))+
  geom_bar(position="fill")

modS<-glm(highcharge~smoker, insurance, family="binomial")
summary(modS)
## 
## Call:
## glm(formula = highcharge ~ smoker, family = "binomial", data = insurance)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3103  -0.3954  -0.3954   0.1855   2.2750  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.5096     0.1163  -21.59   <2e-16 ***
## smokeryes     5.1064     0.2647   19.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1505.91  on 1337  degrees of freedom
## Residual deviance:  705.93  on 1336  degrees of freedom
## AIC: 709.93
## 
## Number of Fisher Scoring iterations: 5
library(rpart)
set.seed(1)
dim(insurance)
## [1] 1338    8
train<-sample(1:nrow(insurance), 892)
tree.insurance<-rpart(highcharge~.-charges, data=insurance,
                     subset=train,
                     control=rpart.control(minsplit=1),
                     method="class")
summary(tree.insurance)
## Call:
## rpart(formula = highcharge ~ . - charges, data = insurance, subset = train, 
##     method = "class", control = rpart.control(minsplit = 1))
##   n= 892 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.71495327      0 1.0000000 1.0000000 0.05959716
## 2 0.01869159      1 0.2850467 0.2850467 0.03522648
## 3 0.01000000      3 0.2476636 0.2616822 0.03385330
## 
## Variable importance
## smoker    bmi    age 
##     92      5      2 
## 
## Node number 1: 892 observations,    complexity param=0.7149533
##   predicted class=No   expected loss=0.2399103  P(node) =1
##     class counts:   678   214
##    probabilities: 0.760 0.240 
##   left son=2 (707 obs) right son=3 (185 obs)
##   Primary splits:
##       smoker splits as  LR,          improve=211.814400, (0 missing)
##       age    < 41.5    to the left,  improve=  2.563925, (0 missing)
##       region splits as  RLRL,        improve=  2.347650, (0 missing)
##       sex    splits as  LR,          improve=  2.205411, (0 missing)
##       bmi    < 22.5575 to the left,  improve=  2.110439, (0 missing)
## 
## Node number 2: 707 observations
##   predicted class=No   expected loss=0.06364922  P(node) =0.7926009
##     class counts:   662    45
##    probabilities: 0.936 0.064 
## 
## Node number 3: 185 observations,    complexity param=0.01869159
##   predicted class=Yes  expected loss=0.08648649  P(node) =0.2073991
##     class counts:    16   169
##    probabilities: 0.086 0.914 
##   left son=6 (21 obs) right son=7 (164 obs)
##   Primary splits:
##       bmi      < 22.51   to the left,  improve=9.06112000, (0 missing)
##       age      < 29.5    to the left,  improve=5.28365100, (0 missing)
##       region   splits as  LLRL,        improve=1.42915400, (0 missing)
##       children < 0.5     to the left,  improve=0.37048590, (0 missing)
##       sex      splits as  LR,          improve=0.07995528, (0 missing)
## 
## Node number 6: 21 observations,    complexity param=0.01869159
##   predicted class=No   expected loss=0.4761905  P(node) =0.0235426
##     class counts:    11    10
##    probabilities: 0.524 0.476 
##   left son=12 (10 obs) right son=13 (11 obs)
##   Primary splits:
##       age      < 28      to the left,  improve=5.40346300, (0 missing)
##       bmi      < 21.825  to the right, improve=1.58730200, (0 missing)
##       region   splits as  LLRL,        improve=1.21303300, (0 missing)
##       children < 0.5     to the left,  improve=0.64285710, (0 missing)
##       sex      splits as  LR,          improve=0.00952381, (0 missing)
##   Surrogate splits:
##       bmi      < 20.3775 to the right, agree=0.667, adj=0.3, (0 split)
##       sex      splits as  LR,          agree=0.571, adj=0.1, (0 split)
##       children < 0.5     to the left,  agree=0.571, adj=0.1, (0 split)
##       region   splits as  RLRR,        agree=0.571, adj=0.1, (0 split)
## 
## Node number 7: 164 observations
##   predicted class=Yes  expected loss=0.0304878  P(node) =0.1838565
##     class counts:     5   159
##    probabilities: 0.030 0.970 
## 
## Node number 12: 10 observations
##   predicted class=No   expected loss=0.1  P(node) =0.01121076
##     class counts:     9     1
##    probabilities: 0.900 0.100 
## 
## Node number 13: 11 observations
##   predicted class=Yes  expected loss=0.1818182  P(node) =0.01233184
##     class counts:     2     9
##    probabilities: 0.182 0.818
plot(tree.insurance , uniform=TRUE,margin=0.2,
     main="Classification Tree for Insurance Charges")
text(tree.insurance , use.n=TRUE, all=TRUE, cex=.8)

tree.insurance$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.71495327      0 1.0000000 1.0000000 0.05959716
## 2 0.01869159      1 0.2850467 0.2850467 0.03522648
## 3 0.01000000      3 0.2476636 0.2616822 0.03385330
#install.packages("rpart.plot")
library(rpart.plot)
prp(tree.insurance, faclen = 0, cex = 0.7, extra=1, space=.5)

prune.insurance<- prune(tree.insurance, cp=0.01) # from cptable   

# plot the pruned tree
par(mfrow=c(1,1)) 
plot(prune.insurance, uniform=TRUE,margin=0.2,
     main="Pruned Regression Tree for Nsplit=3")
text(prune.insurance, use.n=TRUE, all=TRUE, cex=.8)

highcharge.test<-insurance$highcharge[-train]

# what we predict
tree.pred2<-predict(prune.insurance, newdata =insurance , type="class")
test.pred<-tree.pred2[-train]
cm<-table(test.pred, highcharge.test)
cm
##          highcharge.test
## test.pred  No Yes
##       No  323  35
##       Yes   2  86
sum(diag(cm))/sum(cm)
## [1] 0.9170404