bigdata_part03

#exam7)다음은 "PimaIndiansDiabetes"에서 age를 1:20~40, 2:41~60, 3:60세 이상의 범주형 변수로 변환한 후에
# 가장 높은 연령 범주의 발병률을 출력하시오. 단, 발병률은 diabetes 중 pos 수/연령 범주별 인원 수*100이다.

library(mlbench)

## Warning: 패키지 'mlbench'는 R 버전 4.1.3에서 작성되었습니다

library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the PimaIndiansDiabetes data set, keep a working copy under a shorter
# name, and print a compact column-by-column overview.
data("PimaIndiansDiabetes")
pima <- PimaIndiansDiabetes
glimpse(pima)

## Rows: 768
## Columns: 9
## $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1~
## $ glucose  <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,~
## $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 72, 0,~
## $ triceps  <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, 47, 0~
## $ insulin  <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0, 230~
## $ mass     <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 0.0, 37~
## $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158~
## $ age      <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3~
## $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n~

# Count the NA entries in each column.
pima %>%
  is.na() %>%
  colSums()

## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##        0        0        0        0        0        0        0        0 
## diabetes 
##        0

# Bin age into three classes stored as character labels:
#   "1" = under 41, "2" = 41 to 59, "3" = 60 and over.
# findInterval() counts how many of the cut points (41, 60) each age has
# reached (0, 1 or 2); adding 1 turns that count into the class number.
pima1 <- pima %>%
  mutate(age_class = as.character(findInterval(age, c(41, 60)) + 1L))
# Number of rows in each age class.
table(pima1$age_class)

## 
##   1   2   3 
## 574 162  32

# Per age class: group size (n), number of positive diabetes cases (fre), and
# the incidence rate in percent (ill_rate = fre / n * 100).
df <- pima1 %>%
  group_by(age_class) %>%
  summarize(
    n = n(),
    fre = sum(diabetes == "pos"),
    ill_rate = fre / n * 100
  )
df

## # A tibble: 3 x 4
##   age_class     n   fre ill_rate
##   <chr>     <int> <int>    <dbl>
## 1 1           574   166     28.9
## 2 2           162    93     57.4
## 3 3            32     9     28.1

# Report the highest incidence rate among the age classes. Taking the maximum
# avoids hard-coding the row position read off the printed table, which would
# silently pick the wrong class if the data or the bin edges changed.
# NOTE(review): assumes the exercise asks for the highest rate, not the rate of
# the oldest class -- confirm against the problem statement.
print(max(df$ill_rate))

## [1] 57.40741

bigdata_part03_07

kim kye chul

2022 5 22