Visualization
`with`를 쓰지 않고 `hist(custdata$age, prob=TRUE)`로 하면 어느 요소가 어떻게 달라지는가?
# Histogram of age on the density scale; with() lets hist() see the bare
# column name `age`.
with(custdata, hist(age, prob=TRUE))

# Draw the same histogram again, then overlay the estimated density of age
# as a red line on top of it.
with(custdata, hist(age, prob=TRUE))
with(custdata, lines(density(age), col="red"))

library(ggplot2)
# Density curve of customer age; evaluating the bare name displays the plot.
g1 <- ggplot(data = custdata, mapping = aes(x = age)) +
  geom_density()
g1

- 도표 안에 텍스트를 추가하려면, `annotate()` 사용
# Add one text label at a time to the age density plot. Every intermediate
# plot (g2 .. g5) is stored and displayed so the build-up can be followed.
g2 <- g1 +
  annotate(
    "text",
    x = 20, y = 0.02,
    label = "50세 이전에\n 많은 관찰값",
    family = "HCR Batang LVT"
  )
g2

g3 <- g2 +
  annotate(
    "text",
    x = 130, y = 0.002,
    label = "100세 이상에서\n 관찰되는 이상점?",
    family = "HCR Batang LVT"
  )
g3

g4 <- g3 +
  annotate(
    "text",
    x = 0, y = 0.002,
    label = "0세의\n 고객?",
    family = "HCR Batang LVT"
  )
g4

g5 <- g4 +
  annotate(
    "text",
    x = 90, y = 0.01,
    label = "75세 이후\n예상을 넘는\n많은고객",
    family = "HCR Batang LVT"
  )
g5

- 기초통계를 파악하는 데는 `summary()`가 낫다는 기술에 대해서. 적어도 분위수에 관한 한 `ecdf`가 시각적으로 우수함.
# Empirical cumulative distribution of age, stored and then displayed.
g.ecdf <- ggplot(data = custdata, mapping = aes(x = age)) +
  stat_ecdf()
g.ecdf

- 게다가 ecdf 곡선 윗부분의 면적이 바로 평균이라는 점을 기억해 두어야 할 것임. 좀 복잡해 보이지만, `geom_polygon()`을 이용하기 위해서는 다각형을 나타내는 좌표를 data frame으로 갖춰야 함.
# Vertices of the region above the empirical CDF of age.
# sort() drops NA values, so the step heights are computed from the sorted
# vector itself; x and y then always have the same length, even if age
# contains NAs. seq_along() also avoids the 1:length() pitfall on empty input.
poly.x <- sort(custdata$age)
poly.y <- seq_along(poly.x) / length(poly.x)
# Append the closing vertex at the top-left corner: (smallest age, 1).
poly.x <- c(poly.x, poly.x[1])
poly.y <- c(poly.y, 1)
poly.age <- data.frame(x = poly.x, y = poly.y)
# Quick base-graphics look at the outline (points joined by lines).
plot(y ~ x, data = poly.age, type = "b")

# The same outline with ggplot2; geom_path() joins the rows in data order.
ggplot(poly.age, aes(x = x, y = y)) + geom_path()

`geom_polygon()`에서 투명도는 `alpha`로 조정. 색은 `fill`로 설정.
# Filled version of the ECDF polygon: first with the default fill at half
# transparency, then in cyan, then with axis titles and text labels added.
p <- ggplot(poly.age, aes(x = x, y = y)) +
  geom_polygon(alpha = 0.5)
p

p1 <- ggplot(poly.age, aes(x = x, y = y)) +
  geom_polygon(fill = "cyan", alpha = 0.5)
p1

p2 <- p1 +
  xlab("Age") +
  ylab("Empirical CDF")
p2

p3 <- p2 +
  annotate(
    "text",
    x = 32, y = 0.8,
    label = "The area above the curve\n is the \"mean\""
  )
p3

p4 <- p3 +
  annotate(
    "text",
    x = 100, y = 0.5,
    label = "분위수 뿐 아니라\n 평균 비교도 가능",
    family = "HCR Dotum LVT",
    colour = "red"
  )
p4

- 히스토그램으로 요약하기. 각각의 차이가 어디서 비롯되는지 이해할 것.
# Count histograms of age with 5-year bins: default look, gray fill, and
# half-transparent bars with labels marking the suspicious ends of the range.
ggplot(custdata, aes(x = age)) +
  geom_histogram(binwidth = 5)

ggplot(custdata, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "gray") +
  annotate("text", x = 80, y = 60, label = "fill=\"gray\"", colour = "red")

ggplot(custdata, aes(x = age)) +
  geom_histogram(binwidth = 5, alpha = 0.5) +
  annotate("text", x = 80, y = 60, label = "alpha=0.5", colour = "red") +
  annotate("text", x = 125, y = 10, label = "Outliers") +
  annotate("text", x = 0, y = 10, label = "Invalid\nvalues")

`prob = TRUE`에 해당하는 ggplot의 설정은?
# Density-scaled histograms of age with 5-year bins.
# after_stat(density) maps y to the density computed by the binning stat; it
# replaces the deprecated ..density.. notation (requires ggplot2 >= 3.3.0).
ggplot(custdata, aes(x = age, y = after_stat(density))) +
  geom_histogram(binwidth = 5)

ggplot(custdata, aes(x = age)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5, fill = "gray") +
  annotate("text", x = 90, y = 0.01, label = "fill=\"gray\"", colour = "red")

ggplot(custdata, aes(x = age)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5, alpha = 0.5) +
  annotate("text", x = 90, y = 0.01, label = "alpha=0.5", colour = "red") +
  annotate("text", x = 125, y = 0.002, label = "Outliers") +
  annotate("text", x = 0, y = 0.002, label = "Invalid\nvalues")

library(scales)
# Density of income on a linear axis with dollar-formatted tick labels,
# plus three red text labels placed on the plot.
ggplot(custdata) +
  geom_density(aes(x = income)) +
  scale_x_continuous(labels = dollar) +
  annotate(
    "text",
    x = 150000, y = 0.00001,
    label = "대부분의 분포는\n 10만불 이하에 집중",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 400000, y = 0.0000015,
    label = "40만불 대의\n 부유층\n 고객 집단",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 550000, y = 0.0000015,
    label = "매우 넓은\n 소득 분포,\n 수십 배의 격차",
    family = "HCR Dotum LVT", colour = "red"
  )

- Density plots on log-scale. 왜 `warning=FALSE`를 설정해 놓았는지 확인해 볼 것.
# Density of income on a log10 x axis with dollar-formatted breaks.
# NOTE(review): the accompanying text mentions warning=FALSE; presumably some
# incomes cannot be placed on a log axis and trigger warnings — confirm
# against the data.
ggplot(custdata) +
  geom_density(aes(x = income)) +
  scale_x_log10(breaks = c(100, 1000, 10000, 100000), labels = dollar) +
  # The argument is `sides`; the original `side=` only worked through partial
  # argument matching. "bt" draws the log tick marks on bottom and top.
  annotation_logticks(sides = "bt") +
  annotate(
    "text",
    x = 150, y = 0.05,
    label = "극히 소득이 적은 이상점",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 3000, y = 0.4,
    label = "예상을 넘는\n 1만불 대의\n 소득자들",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 4000, y = 0.7,
    label = "대부분의 고객은\n 2만불-10만불 수준",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 8000, y = 0.9,
    label = "소득분포의 정점은\n 4만불 대에",
    family = "HCR Dotum LVT", colour = "red"
  ) +
  annotate(
    "text",
    x = 400000, y = 0.4,
    label = "20만불\n 이상은\n 드물지만\n이상점으로\n 보이지는\n 않음",
    family = "HCR Dotum LVT", colour = "red"
  )

# Bar chart of the number of customers in each marital status category.
ggplot(data = custdata, mapping = aes(x = marital.stat)) +
  geom_bar(fill = "gray")

- Bar Charts for `state.of.res`
# Bar chart of customers per state, flipped so the state names run down the
# vertical axis; the axis text is shrunk to 80% so the labels fit.
ggplot(data = custdata, mapping = aes(x = state.of.res)) +
  geom_bar(fill = "gray") +
  coord_flip() +
  theme(axis.text.y = element_text(size = rel(0.8)))

- 등록된 거주자 수효대로 각 주를 정렬시키려면, `reorder()`가 필요함. 교재의 방법을 따르면 다음과 같이 할 수 있음.
# Count customers per state. Evaluating inside with() hands table() the bare
# column name, which is what appears as the table's heading in the output
# below. The outer parentheses print the result of the assignment.
(sor.tbl <- with(custdata, table(state.of.res)))
## state.of.res
## Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
## 11 3 9 7 100 11 14 1
## Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
## 49 27 5 4 49 29 10 4
## Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi
## 16 15 6 17 24 52 21 7
## Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico New York
## 21 3 8 4 5 39 6 71
## North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 16 1 51 11 7 70 2 15
## South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia
## 5 22 56 5 3 29 17 13
## Wisconsin Wyoming
## 28 1
# Turn the state counts into a two-column data frame (state, Freq) and show it.
sor.df <- data.frame(sor.tbl)
sor.df
## state.of.res Freq
## 1 Alabama 11
## 2 Alaska 3
## 3 Arizona 9
## 4 Arkansas 7
## 5 California 100
## 6 Colorado 11
## 7 Connecticut 14
## 8 Delaware 1
## 9 Florida 49
## 10 Georgia 27
## 11 Hawaii 5
## 12 Idaho 4
## 13 Illinois 49
## 14 Indiana 29
## 15 Iowa 10
## 16 Kansas 4
## 17 Kentucky 16
## 18 Louisiana 15
## 19 Maine 6
## 20 Maryland 17
## 21 Massachusetts 24
## 22 Michigan 52
## 23 Minnesota 21
## 24 Mississippi 7
## 25 Missouri 21
## 26 Montana 3
## 27 Nebraska 8
## 28 Nevada 4
## 29 New Hampshire 5
## 30 New Jersey 39
## 31 New Mexico 6
## 32 New York 71
## 33 North Carolina 16
## 34 North Dakota 1
## 35 Ohio 51
## 36 Oklahoma 11
## 37 Oregon 7
## 38 Pennsylvania 70
## 39 Rhode Island 2
## 40 South Carolina 15
## 41 South Dakota 5
## 42 Tennessee 22
## 43 Texas 56
## 44 Utah 5
## 45 Vermont 3
## 46 Virginia 29
## 47 Washington 17
## 48 West Virginia 13
## 49 Wisconsin 28
## 50 Wyoming 1
# Rename the second column (the frequencies) to "count", then inspect.
colnames(sor.df)[2] <- "count"
str(sor.df)
## 'data.frame': 50 obs. of 2 variables:
## $ state.of.res: Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ count : int 11 3 9 7 100 11 14 1 49 27 ...
# Inspect the factor that reorder() produces: levels sorted by count.
str(with(sor.df, reorder(state.of.res, count)))
## Factor w/ 50 levels "Delaware","North Dakota",..: 23 5 21 17 50 24 27 1 43 38 ...
## - attr(*, "scores")= num [1:50(1d)] 11 3 9 7 100 11 14 1 49 27 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
# Copy of sor.df whose state factor has its levels ordered by count.
sor.df.o <- transform(
  sor.df,
  state.of.res = reorder(state.of.res, count)
)
str(sor.df.o)
## 'data.frame': 50 obs. of 2 variables:
## $ state.of.res: Factor w/ 50 levels "Delaware","North Dakota",..: 23 5 21 17 50 24 27 1 43 38 ...
## ..- attr(*, "scores")= num [1:50(1d)] 11 3 9 7 100 11 14 1 49 27 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ count : int 11 3 9 7 100 11 14 1 49 27 ...
# Horizontal bar chart of the precomputed counts; bars follow the reordered
# factor levels, so states appear sorted by number of customers.
ggplot(data = sor.df.o, mapping = aes(x = state.of.res, y = count)) +
  geom_bar(stat = "identity", fill = "gray") +
  coord_flip() +
  theme(axis.text.y = element_text(size = rel(0.8)))

- 굳이 `transform()`까지 사용하지 않더라도, `sor.df`만 가지고도 원하는 작업은 할 수 있음.
# Fresh data frame built from the state table, keeping the default Freq column.
sor.df.2 <- data.frame(sor.tbl)
sor.df.2
## state.of.res Freq
## 1 Alabama 11
## 2 Alaska 3
## 3 Arizona 9
## 4 Arkansas 7
## 5 California 100
## 6 Colorado 11
## 7 Connecticut 14
## 8 Delaware 1
## 9 Florida 49
## 10 Georgia 27
## 11 Hawaii 5
## 12 Idaho 4
## 13 Illinois 49
## 14 Indiana 29
## 15 Iowa 10
## 16 Kansas 4
## 17 Kentucky 16
## 18 Louisiana 15
## 19 Maine 6
## 20 Maryland 17
## 21 Massachusetts 24
## 22 Michigan 52
## 23 Minnesota 21
## 24 Mississippi 7
## 25 Missouri 21
## 26 Montana 3
## 27 Nebraska 8
## 28 Nevada 4
## 29 New Hampshire 5
## 30 New Jersey 39
## 31 New Mexico 6
## 32 New York 71
## 33 North Carolina 16
## 34 North Dakota 1
## 35 Ohio 51
## 36 Oklahoma 11
## 37 Oregon 7
## 38 Pennsylvania 70
## 39 Rhode Island 2
## 40 South Carolina 15
## 41 South Dakota 5
## 42 Tennessee 22
## 43 Texas 56
## 44 Utah 5
## 45 Vermont 3
## 46 Virginia 29
## 47 Washington 17
## 48 West Virginia 13
## 49 Wisconsin 28
## 50 Wyoming 1
# Same sorted bar chart, but with reorder() applied directly inside aes();
# the axis titles are set explicitly because the mapping is an expression.
ggplot(
  data = sor.df.2,
  mapping = aes(x = reorder(state.of.res, Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "gray") +
  coord_flip() +
  theme(axis.text.y = element_text(size = rel(0.8))) +
  xlab("State of Residence") +
  ylab("Count")

정리
# Write all objects in the current workspace to an .rda file in the working
# directory.
save.image(file="chapter_3_1_0912.rda")