分析醫療費用

讀取資料

insurance <- read.csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/insurance.csv')
class(insurance)
## [1] "data.frame"
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
head(insurance)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622

探索資料

boxplot(charges ~ smoker, data = insurance)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

ggplot(insurance, aes(x=smoker, y= charges, fill=smoker)) + geom_boxplot()

plot(charges ~ age, data = insurance, col= smoker)

ggplot(insurance, aes(x=age, y= charges)) + geom_point(aes(colour=smoker))

## smokers pays more


plot(charges ~ bmi, data = insurance, col=smoker, pch = 19)
abline(v = 30, col = 'blue', lwd = 3)

ggplot(insurance, aes(x=bmi, y= charges)) + geom_point(aes(colour=smoker))

insurance$bmi_bin <- ifelse(insurance$bmi > 30, 'yes', 'no')
insurance$bmi_bin <- as.factor(insurance$bmi_bin)

boxplot(charges ~ bmi_bin, data = insurance)

ggplot(insurance, aes(x=bmi_bin, y= charges, fill=bmi_bin)) + geom_boxplot()

## High bmi patients pays more


plot(charges ~ age, data = insurance, pch = 19)
fit1 <- lm(charges ~ age, data = insurance)
abline(fit1, col = 'red')

fit2 <- lm(charges ~ poly(age,2), data = insurance)
predicted <- predict(fit2, insurance)
plot(charges ~ age, data = insurance, pch = 19)
lines(insurance$age[order(insurance$age)], predicted[order(insurance$age)], col ='red')

## old people pays more

資料預處理

head(insurance)
##   age    sex    bmi children smoker    region   charges bmi_bin
## 1  19 female 27.900        0    yes southwest 16884.924      no
## 2  18   male 33.770        1     no southeast  1725.552     yes
## 3  28   male 33.000        3     no southeast  4449.462     yes
## 4  33   male 22.705        0     no northwest 21984.471      no
## 5  32   male 28.880        0     no northwest  3866.855      no
## 6  31 female 25.740        0     no southeast  3756.622      no
# remove bmi_bin
insurance$bmi_bin <- NULL

## creaet dummy variable
table(insurance$region)
## 
## northeast northwest southeast southwest 
##       324       325       364       325
insurance$regionnorthwest <- ifelse(insurance$region == 'northwest', 1, 0)
insurance$regionsoutheast <- ifelse(insurance$region == 'southeast', 1, 0)
insurance$regionsouthwest<- ifelse(insurance$region == 'southwest', 1, 0)

insurance$region <- NULL

# create sexmale
insurance$sexmale <- ifelse(insurance$sex == 'male', 1, 0)
insurance$sex <- NULL

head(insurance)
##   age    bmi children smoker   charges regionnorthwest regionsoutheast
## 1  19 27.900        0    yes 16884.924               0               0
## 2  18 33.770        1     no  1725.552               0               1
## 3  28 33.000        3     no  4449.462               0               1
## 4  33 22.705        0     no 21984.471               1               0
## 5  32 28.880        0     no  3866.855               1               0
## 6  31 25.740        0     no  3756.622               0               1
##   regionsouthwest sexmale
## 1               1       0
## 2               0       1
## 3               0       1
## 4               0       1
## 5               0       1
## 6               0       0
# create smokeryes
insurance$smokeryes <- ifelse(insurance$smoker == 'yes', 1, 0)
insurance$smoker <- NULL
head(insurance)
##   age    bmi children   charges regionnorthwest regionsoutheast
## 1  19 27.900        0 16884.924               0               0
## 2  18 33.770        1  1725.552               0               1
## 3  28 33.000        3  4449.462               0               1
## 4  33 22.705        0 21984.471               1               0
## 5  32 28.880        0  3866.855               1               0
## 6  31 25.740        0  3756.622               0               1
##   regionsouthwest sexmale smokeryes
## 1               1       0         1
## 2               0       1         0
## 3               0       1         0
## 4               0       1         0
## 5               0       1         0
## 6               0       0         0

建立迴歸模型

# method 1
ins_model <- glm(charges ~ ., data = insurance, family = 'gaussian')


# method 2
ins_model <- lm(charges ~ ., data = insurance)
# lm = glm(family= gaussian)

summary(ins_model)
## 
## Call:
## lm(formula = charges ~ ., data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## sexmale           -131.3      332.9  -0.394 0.693348    
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
hist(ins_model$residuals)

調整變數

insurance$age2  <- insurance$age ^ 2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)


ins_model2 <- lm(charges ~ age + age2 + children + bmi + sexmale +
bmi30*smokeryes + regionnorthwest + regionsouthwest+ regionsoutheast, data = insurance)

summary(ins_model2)
## 
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + sexmale + 
##     bmi30 * smokeryes + regionnorthwest + regionsouthwest + regionsoutheast, 
##     data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17296.4  -1656.0  -1263.3   -722.1  24160.2 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       134.2509  1362.7511   0.099 0.921539    
## age               -32.6851    59.8242  -0.546 0.584915    
## age2                3.7316     0.7463   5.000 6.50e-07 ***
## children          678.5612   105.8831   6.409 2.04e-10 ***
## bmi               120.0196    34.2660   3.503 0.000476 ***
## sexmale          -496.8245   244.3659  -2.033 0.042240 *  
## bmi30           -1000.1403   422.8402  -2.365 0.018159 *  
## smokeryes       13404.6866   439.9491  30.469  < 2e-16 ***
## regionnorthwest  -279.2038   349.2746  -0.799 0.424212    
## regionsouthwest -1222.6437   350.5285  -3.488 0.000503 ***
## regionsoutheast  -828.5467   351.6352  -2.356 0.018604 *  
## bmi30:smokeryes 19810.7533   604.6567  32.764  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4445 on 1326 degrees of freedom
## Multiple R-squared:  0.8664, Adjusted R-squared:  0.8653 
## F-statistic: 781.7 on 11 and 1326 DF,  p-value: < 2.2e-16
ins_model2_step <- step(ins_model2)
## Start:  AIC=22488.97
## charges ~ age + age2 + children + bmi + sexmale + bmi30 * smokeryes + 
##     regionnorthwest + regionsouthwest + regionsoutheast
## 
##                   Df  Sum of Sq        RSS   AIC
## - age              1 5.8972e+06 2.6203e+10 22487
## - regionnorthwest  1 1.2624e+07 2.6209e+10 22488
## <none>                          2.6197e+10 22489
## - sexmale          1 8.1663e+07 2.6278e+10 22491
## - regionsoutheast  1 1.0969e+08 2.6306e+10 22493
## - regionsouthwest  1 2.4036e+08 2.6437e+10 22499
## - bmi              1 2.4237e+08 2.6439e+10 22499
## - age2             1 4.9392e+08 2.6691e+10 22512
## - children         1 8.1138e+08 2.7008e+10 22528
## - bmi30:smokeryes  1 2.1207e+10 4.7404e+10 23281
## 
## Step:  AIC=22487.27
## charges ~ age2 + children + bmi + sexmale + bmi30 + smokeryes + 
##     regionnorthwest + regionsouthwest + regionsoutheast + bmi30:smokeryes
## 
##                   Df  Sum of Sq        RSS   AIC
## - regionnorthwest  1 1.2495e+07 2.6215e+10 22486
## <none>                          2.6203e+10 22487
## - sexmale          1 8.1315e+07 2.6284e+10 22489
## - regionsoutheast  1 1.0932e+08 2.6312e+10 22491
## - bmi              1 2.3974e+08 2.6442e+10 22498
## - regionsouthwest  1 2.4036e+08 2.6443e+10 22498
## - children         1 8.4710e+08 2.7050e+10 22528
## - age2             1 1.8466e+10 4.4668e+10 23199
## - bmi30:smokeryes  1 2.1206e+10 4.7408e+10 23279
## 
## Step:  AIC=22485.91
## charges ~ age2 + children + bmi + sexmale + bmi30 + smokeryes + 
##     regionsouthwest + regionsoutheast + bmi30:smokeryes
## 
##                   Df  Sum of Sq        RSS   AIC
## <none>                          2.6215e+10 22486
## - sexmale          1 8.1065e+07 2.6296e+10 22488
## - regionsoutheast  1 1.0039e+08 2.6315e+10 22489
## - bmi              1 2.4091e+08 2.6456e+10 22496
## - regionsouthwest  1 2.5142e+08 2.6466e+10 22497
## - children         1 8.4178e+08 2.7057e+10 22526
## - age2             1 1.8469e+10 4.4684e+10 23197
## - bmi30:smokeryes  1 2.1219e+10 4.7434e+10 23277
summary(ins_model2_step)
## 
## Call:
## lm(formula = charges ~ age2 + children + bmi + sexmale + bmi30 + 
##     smokeryes + regionsouthwest + regionsoutheast + bmi30:smokeryes, 
##     data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17408.6  -1663.5  -1247.8   -746.3  24246.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -558.5302   902.7870  -0.619 0.536238    
## age2                3.3284     0.1088  30.588  < 2e-16 ***
## children          658.7696   100.8813   6.530 9.33e-11 ***
## bmi               119.5555    34.2231   3.493 0.000493 ***
## sexmale          -494.9813   244.2577  -2.026 0.042916 *  
## bmi30            -991.1961   421.7661  -2.350 0.018914 *  
## smokeryes       13407.6105   439.7533  30.489  < 2e-16 ***
## regionsouthwest -1083.4730   303.5968  -3.569 0.000371 ***
## regionsoutheast  -688.6991   305.3897  -2.255 0.024286 *  
## bmi30:smokeryes 19815.1591   604.3778  32.786  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4443 on 1328 degrees of freedom
## Multiple R-squared:  0.8663, Adjusted R-squared:  0.8654 
## F-statistic: 956.1 on 9 and 1328 DF,  p-value: < 2.2e-16

定義距離

a <- c(2,3)
b <- c(7,8)

x <- c(2,7)
y <- c(3,8)

plot(x, y, col='red', pch = 19, xlim =c(0,10), ylim =c(0,10))
lines(x, y, col='blue')

sqrt(((7 - 2)^ 2  + (8 - 3) ^ 2))
## [1] 7.071068
sqrt(sum((b - a) ^ 2))
## [1] 7.071068
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

sqrt(sum((x - y) ^ 2))
## [1] 1.414214
x <- c(2,7)
y <- c(3,8)

plot(x, y, col='red', pch = 19, xlim =c(0,10), ylim =c(0,10))
# (2,3) -> (7, 3) -> (7,8) 
lines(c(2,7,7), c(3,3,8), col='blue')

abs(7 - 2) + abs(8 - 3)
## [1] 10
a <- c(2,3)
b <- c(7,8)
sum(abs(b - a))
## [1] 10
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = "euclidean")
##          x
## y 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
##          x
## y 1.414214
dist(rbind(x,y), method = "manhattan")
##   x
## y 2
dist(rbind(x,y), method ="minkowski", p=1)
##   x
## y 2

階層式分群

data(iris)

hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')

plot(hc)

fit <- cutree(hc, k = 3)
table(fit)
## fit
##  1  2  3 
## 50 64 36
plot(hc, hang=-0.1, cex = 0.7)
rect.hclust(hc, k = 3, border='red')

par(mfrow = c(1,2))
plot(Petal.Length~ Petal.Width, col = Species, data = iris, main = 'Real Answer')
plot(Petal.Length~ Petal.Width, col = fit, data = iris, main='Clustered Result')

scale(iris[1:3,-5], center = FALSE)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.8493514   0.8820571    0.8359140   0.8164966
## 2    0.8160435   0.7560490    0.8359140   0.8164966
## 3    0.7827356   0.8064522    0.7762059   0.8164966
## attr(,"scaled:scale")
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     6.004582     3.967997     1.674813     0.244949
dist(iris[1:3,-5])
##           1         2
## 2 0.5385165          
## 3 0.5099020 0.3000000
dist(scale(iris[1:3,-5], center = FALSE))
##            1          2
## 2 0.13033600           
## 3 0.11712740 0.08494097
# 柯文哲選情加分?丁、姚都說考驗在未來
# 評總統兩岸談話柯文哲:兩霸權對抗台灣要慎思
# 兩岸論述反覆遭疑誠信柯文哲:我是全國最有誠信的

# 柯文哲 選情 加分 考驗 未來 總統 兩岸 談話 台灣 全國
#   1     1    1    1    1    0     0    0    0   0
#   1     0    0    0    0    1     1    1    1   0
#   1     0    0    0    0    0     1    0    0   0



hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')

hc1 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'single')

hc2 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'complete')

par(mfrow=c(1,3))
plot(hc)
plot(hc1)
plot(hc2)

fit <- cutree(hc, k = 3)
fit1 <- cutree(hc1, k = 3)
fit2 <- cutree(hc2, k = 3)


par(mfrow = c(1,3))
plot(Petal.Length~ Petal.Width, col = fit, data = iris, main='Clustered Result')
plot(Petal.Length~ Petal.Width, col = fit1, data = iris, main='Clustered Result1')
plot(Petal.Length~ Petal.Width, col = fit2, data = iris, main='Clustered Result2')