# Load the example cancer-patient data set from the working directory and
# print a compact summary of its columns.
# NOTE(review): in the recorded output below, `height` and `weight` came in as
# factors rather than numbers, which suggests those columns contain some
# non-numeric entries -- verify before using them numerically.
DF <- read.csv(file = "example_cancer.csv", header = TRUE, sep = ",")
str(object = DF)
## 'data.frame': 18310 obs. of 8 variables:
## $ age : int 75 52 67 62 70 76 55 72 64 71 ...
## $ sex : Factor w/ 2 levels "남","여": 1 2 2 1 1 2 1 1 1 1 ...
## $ height : Factor w/ 485 levels "100","130","130.2",..: 252 408 182 262 352 352 467 165 172 232 ...
## $ weight : Factor w/ 638 levels "100","101","101.1",..: 333 446 349 263 343 563 463 243 363 248 ...
## $ dateOfoperation: Factor w/ 351 levels "2011-01-02","2011-01-03",..: 165 134 146 164 154 160 164 147 219 192 ...
## $ cancerStaging : Factor w/ 5 levels "I","II","III",..: 1 4 3 1 2 3 2 3 1 2 ...
## $ hospitalization: int 48 17 10 11 10 10 12 18 15 35 ...
## $ diseaseCode : Factor w/ 13 levels "C18","C180","C181",..: 9 9 9 9 4 11 9 4 13 11 ...
# Age distribution by decade ----
#
# cut() replaces each value with the interval (defined by `breaks`) that it
# falls into, which is very handy for looking at a distribution bin by bin.
# table() then counts the observations per bin, much like a GROUP BY, so the
# result is easy to read.
#
# The bins are left-closed ([10,20), [20,30), ...) so that each bin holds
# exactly one decade of ages and the "10s", "20s", ... labels assigned below
# are accurate. With cut()'s default right-closed bins ((10,20], (20,30], ...)
# an age of exactly 20 would be counted under "10s" and an age of 10 would be
# dropped as NA. include.lowest = TRUE closes the last bin on the right
# ([100,110]) so an age of exactly 110 is still counted.
#
# To inspect the inputs interactively, print DF$age or run the cut() call on
# its own.
age_breaks <- seq(10, 110, by = 10)
DegreeOfAge <- table(
  cut(DF$age, breaks = age_breaks, right = FALSE, include.lowest = TRUE)
)
DegreeOfAge
# Recorded output of the ORIGINAL run, which used right-closed bins; kept for
# reference only. Counts can shift between neighbouring bins with the
# left-closed bins used above -- re-run to regenerate.
##
## (10,20] (20,30] (30,40] (40,50] (50,60] (60,70] (70,80]
## 3 77 482 1917 4558 5679 4598
## (80,90] (90,100] (100,110]
## 962 33 1

# rownames<- sets the header names of the table.
# paste0() concatenates strings; applied to a vector it works element-wise.
# The labels are derived from the lower edge of each bin so they always stay
# in sync with `age_breaks`: "10s", "20s", ..., "100s".
rownames(DegreeOfAge) <- paste0(age_breaks[-length(age_breaks)], "s")
DegreeOfAge
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.3.2
# Frequency polygon of patient age in 10-year bins, drawn as a thick orange
# line with the Wall Street Journal theme from ggthemes.
ggplot(data = DF, mapping = aes(x = age)) +
  geom_freqpoly(binwidth = 10, size = 1.4, colour = "orange") +
  theme_wsj()

# Open the help page for ggplot().
?ggplot
## starting httpd help server ...
## done
# Look things up here: <http://docs.ggplot2.org/current/>