分析醫療費用
讀取資料
insurance <- read.csv('https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/insurance.csv')
class(insurance)
## [1] "data.frame"
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
head(insurance)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
探索資料
boxplot(charges ~ smoker, data = insurance)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

ggplot(insurance, aes(x=smoker, y= charges, fill=smoker)) + geom_boxplot()

plot(charges ~ age, data = insurance, col= smoker)

ggplot(insurance, aes(x=age, y= charges)) + geom_point(aes(colour=smoker))

## smokers pays more
plot(charges ~ bmi, data = insurance, col=smoker, pch = 19)
abline(v = 30, col = 'blue', lwd = 3)

ggplot(insurance, aes(x=bmi, y= charges)) + geom_point(aes(colour=smoker))

insurance$bmi_bin <- ifelse(insurance$bmi > 30, 'yes', 'no')
insurance$bmi_bin <- as.factor(insurance$bmi_bin)
boxplot(charges ~ bmi_bin, data = insurance)

ggplot(insurance, aes(x=bmi_bin, y= charges, fill=bmi_bin)) + geom_boxplot()

## High bmi patients pays more
plot(charges ~ age, data = insurance, pch = 19)
fit1 <- lm(charges ~ age, data = insurance)
abline(fit1, col = 'red')

fit2 <- lm(charges ~ poly(age,2), data = insurance)
predicted <- predict(fit2, insurance)
plot(charges ~ age, data = insurance, pch = 19)
lines(insurance$age[order(insurance$age)], predicted[order(insurance$age)], col ='red')

## old people pays more
資料預處理
head(insurance)
## age sex bmi children smoker region charges bmi_bin
## 1 19 female 27.900 0 yes southwest 16884.924 no
## 2 18 male 33.770 1 no southeast 1725.552 yes
## 3 28 male 33.000 3 no southeast 4449.462 yes
## 4 33 male 22.705 0 no northwest 21984.471 no
## 5 32 male 28.880 0 no northwest 3866.855 no
## 6 31 female 25.740 0 no southeast 3756.622 no
# remove bmi_bin
insurance$bmi_bin <- NULL
## creaet dummy variable
table(insurance$region)
##
## northeast northwest southeast southwest
## 324 325 364 325
insurance$regionnorthwest <- ifelse(insurance$region == 'northwest', 1, 0)
insurance$regionsoutheast <- ifelse(insurance$region == 'southeast', 1, 0)
insurance$regionsouthwest<- ifelse(insurance$region == 'southwest', 1, 0)
insurance$region <- NULL
# create sexmale
insurance$sexmale <- ifelse(insurance$sex == 'male', 1, 0)
insurance$sex <- NULL
head(insurance)
## age bmi children smoker charges regionnorthwest regionsoutheast
## 1 19 27.900 0 yes 16884.924 0 0
## 2 18 33.770 1 no 1725.552 0 1
## 3 28 33.000 3 no 4449.462 0 1
## 4 33 22.705 0 no 21984.471 1 0
## 5 32 28.880 0 no 3866.855 1 0
## 6 31 25.740 0 no 3756.622 0 1
## regionsouthwest sexmale
## 1 1 0
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 0
# create smokeryes
insurance$smokeryes <- ifelse(insurance$smoker == 'yes', 1, 0)
insurance$smoker <- NULL
head(insurance)
## age bmi children charges regionnorthwest regionsoutheast
## 1 19 27.900 0 16884.924 0 0
## 2 18 33.770 1 1725.552 0 1
## 3 28 33.000 3 4449.462 0 1
## 4 33 22.705 0 21984.471 1 0
## 5 32 28.880 0 3866.855 1 0
## 6 31 25.740 0 3756.622 0 1
## regionsouthwest sexmale smokeryes
## 1 1 0 1
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 0
## 6 0 0 0
建立迴歸模型
# method 1
ins_model <- glm(charges ~ ., data = insurance, family = 'gaussian')
# method 2
ins_model <- lm(charges ~ ., data = insurance)
# lm = glm(family= gaussian)
summary(ins_model)
##
## Call:
## lm(formula = charges ~ ., data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## sexmale -131.3 332.9 -0.394 0.693348
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
hist(ins_model$residuals)

調整變數
insurance$age2 <- insurance$age ^ 2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
ins_model2 <- lm(charges ~ age + age2 + children + bmi + sexmale +
bmi30*smokeryes + regionnorthwest + regionsouthwest+ regionsoutheast, data = insurance)
summary(ins_model2)
##
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + sexmale +
## bmi30 * smokeryes + regionnorthwest + regionsouthwest + regionsoutheast,
## data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17296.4 -1656.0 -1263.3 -722.1 24160.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 134.2509 1362.7511 0.099 0.921539
## age -32.6851 59.8242 -0.546 0.584915
## age2 3.7316 0.7463 5.000 6.50e-07 ***
## children 678.5612 105.8831 6.409 2.04e-10 ***
## bmi 120.0196 34.2660 3.503 0.000476 ***
## sexmale -496.8245 244.3659 -2.033 0.042240 *
## bmi30 -1000.1403 422.8402 -2.365 0.018159 *
## smokeryes 13404.6866 439.9491 30.469 < 2e-16 ***
## regionnorthwest -279.2038 349.2746 -0.799 0.424212
## regionsouthwest -1222.6437 350.5285 -3.488 0.000503 ***
## regionsoutheast -828.5467 351.6352 -2.356 0.018604 *
## bmi30:smokeryes 19810.7533 604.6567 32.764 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4445 on 1326 degrees of freedom
## Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
## F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
ins_model2_step <- step(ins_model2)
## Start: AIC=22488.97
## charges ~ age + age2 + children + bmi + sexmale + bmi30 * smokeryes +
## regionnorthwest + regionsouthwest + regionsoutheast
##
## Df Sum of Sq RSS AIC
## - age 1 5.8972e+06 2.6203e+10 22487
## - regionnorthwest 1 1.2624e+07 2.6209e+10 22488
## <none> 2.6197e+10 22489
## - sexmale 1 8.1663e+07 2.6278e+10 22491
## - regionsoutheast 1 1.0969e+08 2.6306e+10 22493
## - regionsouthwest 1 2.4036e+08 2.6437e+10 22499
## - bmi 1 2.4237e+08 2.6439e+10 22499
## - age2 1 4.9392e+08 2.6691e+10 22512
## - children 1 8.1138e+08 2.7008e+10 22528
## - bmi30:smokeryes 1 2.1207e+10 4.7404e+10 23281
##
## Step: AIC=22487.27
## charges ~ age2 + children + bmi + sexmale + bmi30 + smokeryes +
## regionnorthwest + regionsouthwest + regionsoutheast + bmi30:smokeryes
##
## Df Sum of Sq RSS AIC
## - regionnorthwest 1 1.2495e+07 2.6215e+10 22486
## <none> 2.6203e+10 22487
## - sexmale 1 8.1315e+07 2.6284e+10 22489
## - regionsoutheast 1 1.0932e+08 2.6312e+10 22491
## - bmi 1 2.3974e+08 2.6442e+10 22498
## - regionsouthwest 1 2.4036e+08 2.6443e+10 22498
## - children 1 8.4710e+08 2.7050e+10 22528
## - age2 1 1.8466e+10 4.4668e+10 23199
## - bmi30:smokeryes 1 2.1206e+10 4.7408e+10 23279
##
## Step: AIC=22485.91
## charges ~ age2 + children + bmi + sexmale + bmi30 + smokeryes +
## regionsouthwest + regionsoutheast + bmi30:smokeryes
##
## Df Sum of Sq RSS AIC
## <none> 2.6215e+10 22486
## - sexmale 1 8.1065e+07 2.6296e+10 22488
## - regionsoutheast 1 1.0039e+08 2.6315e+10 22489
## - bmi 1 2.4091e+08 2.6456e+10 22496
## - regionsouthwest 1 2.5142e+08 2.6466e+10 22497
## - children 1 8.4178e+08 2.7057e+10 22526
## - age2 1 1.8469e+10 4.4684e+10 23197
## - bmi30:smokeryes 1 2.1219e+10 4.7434e+10 23277
summary(ins_model2_step)
##
## Call:
## lm(formula = charges ~ age2 + children + bmi + sexmale + bmi30 +
## smokeryes + regionsouthwest + regionsoutheast + bmi30:smokeryes,
## data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17408.6 -1663.5 -1247.8 -746.3 24246.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -558.5302 902.7870 -0.619 0.536238
## age2 3.3284 0.1088 30.588 < 2e-16 ***
## children 658.7696 100.8813 6.530 9.33e-11 ***
## bmi 119.5555 34.2231 3.493 0.000493 ***
## sexmale -494.9813 244.2577 -2.026 0.042916 *
## bmi30 -991.1961 421.7661 -2.350 0.018914 *
## smokeryes 13407.6105 439.7533 30.489 < 2e-16 ***
## regionsouthwest -1083.4730 303.5968 -3.569 0.000371 ***
## regionsoutheast -688.6991 305.3897 -2.255 0.024286 *
## bmi30:smokeryes 19815.1591 604.3778 32.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4443 on 1328 degrees of freedom
## Multiple R-squared: 0.8663, Adjusted R-squared: 0.8654
## F-statistic: 956.1 on 9 and 1328 DF, p-value: < 2.2e-16
定義距離
a <- c(2,3)
b <- c(7,8)
x <- c(2,7)
y <- c(3,8)
plot(x, y, col='red', pch = 19, xlim =c(0,10), ylim =c(0,10))
lines(x, y, col='blue')

sqrt(((7 - 2)^ 2 + (8 - 3) ^ 2))
## [1] 7.071068
sqrt(sum((b - a) ^ 2))
## [1] 7.071068
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
x <- c(2,7)
y <- c(3,8)
plot(x, y, col='red', pch = 19, xlim =c(0,10), ylim =c(0,10))
# (2,3) -> (7, 3) -> (7,8)
lines(c(2,7,7), c(3,3,8), col='blue')

abs(7 - 2) + abs(8 - 3)
## [1] 10
a <- c(2,3)
b <- c(7,8)
sum(abs(b - a))
## [1] 10
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = "euclidean")
## x
## y 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
## x
## y 1.414214
dist(rbind(x,y), method = "manhattan")
## x
## y 2
dist(rbind(x,y), method ="minkowski", p=1)
## x
## y 2
階層式分群
data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')
plot(hc)

fit <- cutree(hc, k = 3)
table(fit)
## fit
## 1 2 3
## 50 64 36
plot(hc, hang=-0.1, cex = 0.7)
rect.hclust(hc, k = 3, border='red')

par(mfrow = c(1,2))
plot(Petal.Length~ Petal.Width, col = Species, data = iris, main = 'Real Answer')
plot(Petal.Length~ Petal.Width, col = fit, data = iris, main='Clustered Result')

scale(iris[1:3,-5], center = FALSE)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 0.8493514 0.8820571 0.8359140 0.8164966
## 2 0.8160435 0.7560490 0.8359140 0.8164966
## 3 0.7827356 0.8064522 0.7762059 0.8164966
## attr(,"scaled:scale")
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 6.004582 3.967997 1.674813 0.244949
dist(iris[1:3,-5])
## 1 2
## 2 0.5385165
## 3 0.5099020 0.3000000
dist(scale(iris[1:3,-5], center = FALSE))
## 1 2
## 2 0.13033600
## 3 0.11712740 0.08494097
# 柯文哲選情加分?丁、姚都說考驗在未來
# 評總統兩岸談話柯文哲:兩霸權對抗台灣要慎思
# 兩岸論述反覆遭疑誠信柯文哲:我是全國最有誠信的
# 柯文哲 選情 加分 考驗 未來 總統 兩岸 談話 台灣 全國
# 1 1 1 1 1 0 0 0 0 0
# 1 0 0 0 0 1 1 1 1 0
# 1 0 0 0 0 0 1 0 0 0
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')
hc1 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'single')
hc2 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'complete')
par(mfrow=c(1,3))
plot(hc)
plot(hc1)
plot(hc2)

fit <- cutree(hc, k = 3)
fit1 <- cutree(hc1, k = 3)
fit2 <- cutree(hc2, k = 3)
par(mfrow = c(1,3))
plot(Petal.Length~ Petal.Width, col = fit, data = iris, main='Clustered Result')
plot(Petal.Length~ Petal.Width, col = fit1, data = iris, main='Clustered Result1')
plot(Petal.Length~ Petal.Width, col = fit2, data = iris, main='Clustered Result2')
