code 2.1

csv 파일을 읽고 쓰는 연습

# Read the CSV file and print the resulting data frame (nothing is stored).
read.csv(file = "F:\\cloud\\2014년 여름방학\\project - 0book\\주석\\2-1.csv")

##        Name Sex Age Height Weight
## 1  알프레드  남  14   69.0  112.5
## 2    앨리스  여  13   56.5   84.0
## 3    바바라  여  13   65.3   98.0
## 4      캐롤  여  14   62.8  102.5
## 5      헨리  남  14   63.5  102.5
## 6    제임스  남  12   57.3   83.0
## 7      제인  여  12   59.8   84.5
## 8      자넷  여  15   62.5  112.5
## 9    제프리  남  13   62.5   84.0
## 10       존  남  12   59.0   99.5
## 11   조이스  여  11   51.3   50.5
## 12     주디  여  14   64.3   90.0
## 13   루이스  여  12   56.3   77.0
## 14     메리  여  15   66.5  112.0
## 15     필립  남  16   72.0  150.0
## 16   로버트  남  12   64.8  128.0
## 17   로날드  남  15   67.0  133.0
## 18   토마스  남  11   57.5   85.0
## 19   윌리엄  남  15   66.5  112.0

# Read the CSV file and store it as the data frame `class`.
class <- read.csv(file = "F:\\cloud\\2014년 여름방학\\project - 0book\\주석\\2-1.csv")

code 2.2

요약표를 만들어 보자.

# Printing a data object shows its contents.
print(class)

##        Name Sex Age Height Weight
## 1  알프레드  남  14   69.0  112.5
## 2    앨리스  여  13   56.5   84.0
## 3    바바라  여  13   65.3   98.0
## 4      캐롤  여  14   62.8  102.5
## 5      헨리  남  14   63.5  102.5
## 6    제임스  남  12   57.3   83.0
## 7      제인  여  12   59.8   84.5
## 8      자넷  여  15   62.5  112.5
## 9    제프리  남  13   62.5   84.0
## 10       존  남  12   59.0   99.5
## 11   조이스  여  11   51.3   50.5
## 12     주디  여  14   64.3   90.0
## 13   루이스  여  12   56.3   77.0
## 14     메리  여  15   66.5  112.0
## 15     필립  남  16   72.0  150.0
## 16   로버트  남  12   64.8  128.0
## 17   로날드  남  15   67.0  133.0
## 18   토마스  남  11   57.5   85.0
## 19   윌리엄  남  15   66.5  112.0

# Simplest summary table: number of rows for each value of Sex.
table(class$Sex)

## 
## 남 여 
## 10  9

# Two-way contingency table: Sex in rows, Age in columns.
table(class$Sex, class$Age)

##     
##      11 12 13 14 15 16
##   남  1  3  1  2  2  1
##   여  1  2  2  2  2  0

# For a table with more than two factors (Sex x Age x Name here), ftable()
# prints it as one flat table instead of a series of two-way slices.
ftable(table(class$Sex, class$Age, class$Name))

##        로날드 로버트 루이스 메리 바바라 알프레드 앨리스 윌리엄 자넷 제인 제임스 제프리 조이스 존 주디 캐롤 토마스 필립 헨리
##                                                                                                                            
## 남 11       0      0      0    0      0        0      0      0    0    0      0      0      0  0    0    0      1    0    0
##    12       0      1      0    0      0        0      0      0    0    0      1      0      0  1    0    0      0    0    0
##    13       0      0      0    0      0        0      0      0    0    0      0      1      0  0    0    0      0    0    0
##    14       0      0      0    0      0        1      0      0    0    0      0      0      0  0    0    0      0    0    1
##    15       1      0      0    0      0        0      0      1    0    0      0      0      0  0    0    0      0    0    0
##    16       0      0      0    0      0        0      0      0    0    0      0      0      0  0    0    0      0    1    0
## 여 11       0      0      0    0      0        0      0      0    0    0      0      0      1  0    0    0      0    0    0
##    12       0      0      1    0      0        0      0      0    0    1      0      0      0  0    0    0      0    0    0
##    13       0      0      0    0      1        0      1      0    0    0      0      0      0  0    0    0      0    0    0
##    14       0      0      0    0      0        0      0      0    0    0      0      0      0  0    1    1      0    0    0
##    15       0      0      0    1      0        0      0      0    1    0      0      0      0  0    0    0      0    0    0
##    16       0      0      0    0      0        0      0      0    0    0      0      0      0  0    0    0      0    0    0

# When a column already holds the frequency (weight) for each cell, use xtabs():
# the left-hand side of the formula (Age here) is summed within each
# Name x Sex cell.

xtabs(Age ~ Name + Sex, data=class)

##           Sex
## Name       남 여
##   로날드   15  0
##   로버트   12  0
##   루이스    0 12
##   메리      0 15
##   바바라    0 13
##   알프레드 14  0
##   앨리스    0 13
##   윌리엄   15  0
##   자넷      0 15
##   제인      0 12
##   제임스   12  0
##   제프리   13  0
##   조이스    0 11
##   존       12  0
##   주디      0 14
##   캐롤      0 14
##   토마스   11  0
##   필립     16  0
##   헨리     14  0

# plyr: a convenient data-manipulation package.

# Load plyr. If it is not installed, run install.packages("plyr") first.
library(plyr)

# Count the rows for each Sex: ddply() splits `class` by "Sex", summarise
# stores the group size in column `freq`, and the result is kept in m0.
m0 <- ddply(class, "Sex", summarise, freq = length(Sex))

# Share of each Sex as a percentage (proportion rounded to 4 digits, times 100).
m0$per <- 100 * round(m0$freq / sum(m0$freq), digits = 4)
print(m0)

##   Sex freq   per
## 1  남   10 52.63
## 2  여    9 47.37

# Cumulative frequency and cumulative percentage per Sex.
# cumsum() replaces the manual accumulate-and-append loop: it is vectorised,
# does not grow a vector inside a loop, and also works when m0 has zero rows
# (1:nrow(m0) would iterate over c(1, 0) in that case).
m0$cum <- cumsum(m0$freq)
m0$cum.per <- round(m0$cum / sum(m0$freq), 4) * 100
m0

##   Sex freq   per cum cum.per
## 1  남   10 52.63  10   52.63
## 2  여    9 47.37  19  100.00

code 2.3

백분율이 들어가 있는 테이블을 만들어 보자.

# Raw Sex x Age counts, the starting point for the proportion tables.
table(class$Sex, class$Age)

##     
##      11 12 13 14 15 16
##   남  1  3  1  2  2  1
##   여  1  2  2  2  2  0

# Proportion of the grand total in each cell (all cells together sum to 1).
prop.table(table(class$Sex, class$Age))

##     
##              11         12         13         14         15         16
##   남 0.05263158 0.15789474 0.05263158 0.10526316 0.10526316 0.05263158
##   여 0.05263158 0.10526316 0.10526316 0.10526316 0.10526316 0.00000000

# margin = 1: row proportions, i.e. each Sex row sums to 1.
prop.table(table(class$Sex, class$Age),1)

##     
##             11        12        13        14        15        16
##   남 0.1000000 0.3000000 0.1000000 0.2000000 0.2000000 0.1000000
##   여 0.1111111 0.2222222 0.2222222 0.2222222 0.2222222 0.0000000

# margin = 2: column proportions, i.e. each Age column sums to 1.
prop.table(table(class$Sex, class$Age),2)

##     
##             11        12        13        14        15        16
##   남 0.5000000 0.6000000 0.3333333 0.5000000 0.5000000 1.0000000
##   여 0.5000000 0.4000000 0.6666667 0.5000000 0.5000000 0.0000000

# Build four tables from the same Sex x Age cross-tabulation and stack them:
#   t1 = counts
#   t2 = percent of the grand total
#   t3 = row percent    (margin 1, each Sex row sums to 100)
#   t4 = column percent (margin 2, each Age column sums to 100)
sex_by_age <- table(class$Sex, class$Age)
t1 <- round(sex_by_age, 0)
t2 <- round(100 * prop.table(sex_by_age), 2)
t3 <- round(100 * prop.table(sex_by_age, 1), 2)
t4 <- round(100 * prop.table(sex_by_age, 2), 2)

tt <- rbind(t1, t2, t3, t4)
tt

##       11    12    13    14    15     16
## 남  1.00  3.00  1.00  2.00  2.00   1.00
## 여  1.00  2.00  2.00  2.00  2.00   0.00
## 남  5.26 15.79  5.26 10.53 10.53   5.26
## 여  5.26 10.53 10.53 10.53 10.53   0.00
## 남 10.00 30.00 10.00 20.00 20.00  10.00
## 여 11.11 22.22 22.22 22.22 22.22   0.00
## 남 50.00 60.00 33.33 50.00 50.00 100.00
## 여 50.00 40.00 66.67 50.00 50.00   0.00

# Column sums of tt (MARGIN = 2 applies sum over each column).
# NOTE(review): tt stacks counts and three kinds of percentages, so each
# total mixes different quantities.
apply(tt, 2, sum)

##     11     12     13     14     15     16 
## 133.63 183.54 151.01 167.28 167.28 116.26

# Row sums of tt (MARGIN = 1 applies sum over each row).
apply(tt, 1, sum)

##     남     여     남     여     남     여     남     여 
##  10.00   9.00  52.63  47.38 100.00  99.99 343.33 256.67

# Turning a table into a data frame gives a long layout: one row per cell,
# with the two factors in Var1/Var2 and the count in Freq.
data.frame(t1)

##    Var1 Var2 Freq
## 1    남   11    1
## 2    여   11    1
## 3    남   12    3
## 4    여   12    2
## 5    남   13    1
## 6    여   13    2
## 7    남   14    2
## 8    여   14    2
## 9    남   15    2
## 10   여   15    2
## 11   남   16    1
## 12   여   16    0

code 2.4

히스토그램을 그려 보자.

# Histogram with all defaults.
hist(class$Height)

# Density scale instead of counts.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
hist(class$Height, freq = FALSE)

# A single number passed as `breaks` (named here instead of positional).
hist(class$Height, breaks = 3)

# Explicit bin boundaries supplied as a vector.
xx <- hist(class$Height, breaks = c(50, 55, 60, 65, 70, 75))

# xx holds the description of the histogram (breaks, counts, density, mids, ...).
xx

## $breaks
## [1] 50 55 60 65 70 75
## 
## $counts
## [1] 1 6 6 5 1
## 
## $density
## [1] 0.01052632 0.06315789 0.06315789 0.05263158 0.01052632
## 
## $mids
## [1] 52.5 57.5 62.5 67.5 72.5
## 
## $xname
## [1] "class$Height"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

# Redraw the histogram with a title and axis labels.
# The stray trailing comma (an empty extra argument) has been removed.
xx <- hist(class$Height, breaks = c(50, 55, 60, 65, 70, 75),
    main = "히스토그램", xlab = "키", ylab = "빈도")

# One descriptive label per bin, in bin order.
type <- c("매우 작음", "작음", "보통", "큼", "매우 큼")

# Combine labels, bin midpoints and bin counts. data.frame() derives the
# column names xx.mids and xx.counts from the expressions.
hist.result <- data.frame(type, xx$mids, xx$counts)

hist.result

##        type xx.mids xx.counts
## 1 매우 작음    52.5         1
## 2      작음    57.5         6
## 3      보통    62.5         6
## 4        큼    67.5         5
## 5   매우 큼    72.5         1

# Open the help page for hist().
help("hist")

## starting httpd help server ... done

# Plot the bin counts against their index.
plot(hist.result$xx.counts)

# To put labels under the bars, pass them through names.arg.
# The original `names=` only worked through partial argument matching.
kk <- barplot(hist.result$xx.counts, names.arg = hist.result$type)

code 2.6 기하평균 생략

code 2.7

절사평균을 구해보자.

# Trimmed mean: `trim` is the fraction of observations dropped from each end
# before averaging, so 0.05 drops 5% from both tails.
# NOTE(review): with only 19 rows, 5% per tail is less than one observation,
# so the result appears to equal the plain mean — confirm.
mean(class$Height, trim=0.05)

## [1] 62.33684

code 2.8

히스토그램을 원하는 대로 그려 보자.

# Density-scaled histogram with bin edges every 2 units from 45 to 79,
# then a kernel density estimate drawn on top of it.
xxx <- hist(
  class$Height,
  breaks = seq(from = 45, to = 79, by = 2),
  freq = FALSE
)
lines(density(class$Height))

code 2.9

그림을 그려 보자. 편리한 그림 패키지 ggplot2

library(ggplot2)

# Histogram of Height with 5-unit bins.
ggplot(class, aes(x = Height)) +
  geom_histogram(binwidth = 5)

# Same bins, drawn as white bars with black outlines.
ggplot(class, aes(x = Height)) +
  geom_histogram(binwidth = 5, fill = "white", color = "black")

# 2-unit bins, fill colour mapped to Sex, semi-transparent.
ggplot(class, aes(x = Height, fill = Sex)) +
  geom_histogram(binwidth = 2, alpha = 0.4)

# Density curves of Height, fill colour mapped to Sex.
ggplot(class, aes(x = Height, fill = Sex)) +
  geom_density(alpha = 0.4)

# Show the data again.
print(class)

##        Name Sex Age Height Weight
## 1  알프레드  남  14   69.0  112.5
## 2    앨리스  여  13   56.5   84.0
## 3    바바라  여  13   65.3   98.0
## 4      캐롤  여  14   62.8  102.5
## 5      헨리  남  14   63.5  102.5
## 6    제임스  남  12   57.3   83.0
## 7      제인  여  12   59.8   84.5
## 8      자넷  여  15   62.5  112.5
## 9    제프리  남  13   62.5   84.0
## 10       존  남  12   59.0   99.5
## 11   조이스  여  11   51.3   50.5
## 12     주디  여  14   64.3   90.0
## 13   루이스  여  12   56.3   77.0
## 14     메리  여  15   66.5  112.0
## 15     필립  남  16   72.0  150.0
## 16   로버트  남  12   64.8  128.0
## 17   로날드  남  15   67.0  133.0
## 18   토마스  남  11   57.5   85.0
## 19   윌리엄  남  15   66.5  112.0

# Bar chart drawn together with error bars (this combination is not recommended).
# Per-Sex summaries: mean height, standard deviation of height, and group size.
ebg1 <- ddply(class, "Sex", summarise, avr.height = mean(Height))
ebg2 <- ddply(class, "Sex", summarise, sd.height = sd(Height))
ebg3 <- ddply(class, "Sex", summarise, n = length(Sex))

# Mean height per Sex.
print(ebg1)

##   Sex avr.height
## 1  남   63.91000
## 2  여   60.58889

# Standard deviation of height per Sex.
print(ebg2)

##   Sex sd.height
## 1  남  4.937937
## 2  여  5.018328

# Group size per Sex.
print(ebg3)

##   Sex  n
## 1  남 10
## 2  여  9

# Collect the sd and n columns in ebg1, then add `se`:
# the standard deviation divided by the square root of the group size.
ebg1$sd.height <- ebg2$sd.height
ebg1$n <- ebg3$n
ebg1$se <- ebg1$sd.height / ebg1$n^0.5


# Bar chart of mean height by Sex with error bars of +/- 1.65 * se, the
# rounded mean as a text label, and a horizontal line (with its rounded value)
# at the overall mean of class$Height.
# NOTE(review): 1.65 is presumably the normal quantile for a 90% interval — confirm.
ggplot(ebg1, aes(x = Sex, y = avr.height)) +
  geom_bar(fill = "white", color = "black", stat = "identity") +
  geom_errorbar(
    aes(ymin = avr.height - se * 1.65, ymax = avr.height + se * 1.65),
    width = 0.2
  ) +
  geom_text(aes(label = round(avr.height, 2)), hjust = -1, size = 4) +
  geom_hline(yintercept = mean(class$Height)) +
  annotate(
    "text",
    x = 1.5,
    y = mean(class$Height),
    label = round(mean(class$Height), 2)
  )

# A dot plot is the more sensible partner for error bars: same layers as the
# bar version, but the group means are drawn as points.
ggplot(ebg1, aes(x = Sex, y = avr.height)) +
  geom_point(size = 4) +
  geom_errorbar(
    aes(ymin = avr.height - se * 1.65, ymax = avr.height + se * 1.65),
    width = 0.2
  ) +
  geom_text(aes(label = round(avr.height, 2)), hjust = -1, size = 4) +
  geom_hline(yintercept = mean(class$Height)) +
  annotate(
    "text",
    x = 1.5,
    y = mean(class$Height),
    label = round(mean(class$Height), 2)
  )

code 2.10 성별 산점도와 회귀직선

# Scatter plot of Weight against Height, coloured by Sex, with a linear-model
# smoother per Sex.
ggplot(class, aes(x = Height, y = Weight, color = Sex)) +
  geom_point() +
  stat_smooth(method = lm)

# Same scatter plot with the smoother's se band switched off (se = FALSE)
# and each point labelled with Name.
ggplot(class, aes(x = Height, y = Weight, color = Sex)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_text(
    aes(label = Name),
    vjust = 0.2,
    hjust = -0.2,
    size = 3
  )