1. 기술통계

1.1 중심위치의 측정

1.1.1 산술평균

# Arithmetic mean of a small numeric sample.
weight <- c(72, 67, 60, 78, 82)
mean(x = weight)
## [1] 71.8

1.1.2 결측치가 있는 경우의 평균

  • 1-100까지 난수 생성
# Fix the seed, then draw 100 values from 1..100 with replacement
# (replace = TRUE means a value may be drawn more than once).
set.seed(1234)
myNum <- sample(1:100, size = 100, replace = TRUE)
mean(myNum)
## [1] 44.27
  • 20개 위치를 무작위로 뽑아 결측치(NA)로 대치 / 평균이 NA로 출력
# Choose 20 distinct positions (no replacement) and overwrite them with NA.
n <- sample(1:100, size = 20, replace = FALSE)
n  # positions that will be set to missing
##  [1]   4  56  28  20  13  31  15  96  41 100  65   9  84  11  19  78  80
## [18]  24  87  90
myNum[n] <- NA
mean(myNum)  # a single NA in the input makes the whole result NA
## [1] NA
  • 결측치(NA)를 제거하고 계산하려면 na.rm=TRUE 사용
# Drop the missing values before averaging.
mean(x = myNum, na.rm = TRUE)
## [1] 43.325

1.1.3 가중평균 weighted.mean()

  • 예제 1
# Plain mean versus weighted mean of the same four values.
sales <- c(95, 72, 87, 65)
weights <- c(0.5, 0.25, 0.125, 0.125)
mean(sales)
## [1] 79.75
weighted.mean(x = sales, w = weights)
## [1] 84.5
  • 예제 2
# Two score vectors weighted by the same counts, normalised to proportions.
ascore <- c(4, 3)
bscore <- c(3, 4)
count <- c(3, 2)
weight <- count / sum(count)
weighted.mean(x = ascore, w = weight)
## [1] 3.6
weighted.mean(x = bscore, w = weight)
## [1] 3.4
  • 예제 3
# Weighted mean of four scores, weighted by how often each occurs.
score <- c(90, 80, 70, 60)
count <- c(3, 12, 15, 5)
weight <- count / sum(count)
weighted.mean(x = score, w = weight)
## [1] 73.71429

1.1.4 중앙값 median()

# Mean and median of the same sample, shown side by side.
time <- c(7, 2, 3, 7, 6, 9, 10, 8, 9, 9, 10)
mean(x = time)
## [1] 7.272727
median(x = time)
## [1] 8

1.1.5 최빈값 which.max() / which.min()

# Mode (most frequent value) of a numeric vector.
# table() counts each distinct value; which.max() gives the position of the
# first largest count, and indexing names(freq) by that position yields the
# value itself. The position is taken from which.max() rather than typed in
# by hand, so the lookup stays correct if the data change.
num.v <- c(1, 2, 2, 3, 4, 3, 5, 5, 7, 9, 2, 2, 0)
freq <- table(num.v)
which.max(freq)
## 2 
## 3
names(freq)[which.max(freq)]
## [1] "2"
# Same approach for a character vector.
char.v <- c("o", "it", "the", "it", "it", "a", "a", "a", "a", "a")
freq <- table(char.v)
freq
## char.v
##   a  it   o the 
##   5   3   1   1
which.max(freq)
## a 
## 1
names(freq)[which.max(freq)]
## [1] "a"

1.1.6 빈도표 table()

# Frequency table: how many times each distinct value occurs.
x <- c("a", "b", "c", "c", "c", "d", "d")
table(x)
## x
## a b c d 
## 1 1 3 2

1.1.7 최대,최소값 max(), min()

# Largest and smallest value, ignoring the missing entry.
x <- c(1, 2, 3, NA, 4)
max(x, na.rm = TRUE)
## [1] 4
min(x, na.rm = TRUE)
## [1] 1

1.1.8 주사위 예제

  • 10번 시행
# Roll a six-sided die 10 times with a fixed seed.
set.seed(1234)
dice <- sample(1:6, size = 10, replace = TRUE)
mean(dice)
sd(dice)
## [1] 3.4
## [1] 1.577621
  • 1,000,000번 시행
# Roll the die 1,000,000 times with the same seed.
set.seed(1234)
dice <- sample(1:6, size = 1000000, replace = TRUE)
mean(dice)
sd(dice)
## [1] 3.498788
## [1] 1.708511

1.1.9 타이타닉 예제

  • 데이터 불러오기
# Read the Titanic training data from a path relative to the working directory.
titanic <- read.csv(file = "data/Titanic/train.csv")
head(titanic)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
# attach() puts the columns on the search path; later snippets refer to
# Age, Fare and Survived without the titanic$ prefix and depend on this call.
attach(titanic)
mean(Survived)  # survival rate: Survived is coded 0/1, so the mean is a proportion
## [1] 0.3838384
  • 나이에 대한 기초통계량 확인
# Mean, standard deviation and variance of Age, skipping missing values.
mean(Age, na.rm = TRUE)
sd(Age, na.rm = TRUE)
var(Age, na.rm = TRUE) # 2
## [1] 29.69912
## [1] 14.5265
## [1] 211.0191
  • 생존자 나이 평균
# Mean age of survivors, computed two ways.
# (a) Filter the rows with Survived == 1, then average Age.
titanic.survived <- titanic[titanic$Survived == 1, ]
mean(titanic.survived$Age, na.rm = TRUE) # 3
## [1] 28.34369
# (b) Use the 0/1 Survived column as weights, so non-survivors get weight 0.
weighted.mean(x = titanic$Age, w = titanic$Survived, na.rm = TRUE)
## [1] 28.34369

1.2 자료의 분산

1.2.1 범위 max()-min()

# Range of the sample: largest value minus smallest value.
weight <- c(72, 67, 60, 78, 82)
max(weight) - min(weight)
## [1] 22

1.2.2 분산 var()

# Sample variance of weight.
var(x = weight)
## [1] 76.2

1.2.3 표준편차 sd()

# Sample standard deviation of weight.
sd(x = weight)
## [1] 8.729261

1.2.4 타이타닉 예제

  • 나이에 대한 평균, 중앙값, 최대값
# Mean, median and maximum of Age, skipping missing values.
# max() returns the largest value itself; which.max() would instead return the
# row position of that value, which is not what this snippet reports.
mean(Age, na.rm = TRUE)
median(Age, na.rm = TRUE)
max(Age, na.rm = TRUE)
## [1] 29.69912
## [1] 28
## [1] 80
  • 최다 좌석 등급
# Passenger counts per class, then the class with the largest count.
table(titanic[["Pclass"]])
## 
##   1   2   3 
## 216 184 491
which.max(table(titanic[["Pclass"]]))
## 3 
## 3
  • 나이 범위
# Smallest and largest age, then the width of that span.
range(titanic$Age, na.rm = TRUE)
## [1]  0.42 80.00
max(titanic$Age, na.rm = TRUE) - min(titanic$Age, na.rm = TRUE)
## [1] 79.58
  • 운임 범위
# Width of the fare span: largest fare minus smallest fare.
max(titanic$Fare, na.rm = TRUE) - min(titanic$Fare, na.rm = TRUE)
## [1] 512.3292

1.3 자료의 위치

1.3.1 사분위수 quantile() or summary()

# Quartiles, arbitrary quantiles, and the six-number summary of x.
x <- c(136, 182, 166, 132, 130, 186, 140, 155)
quantile(x)
##    0%   25%   50%   75%  100% 
## 130.0 135.0 147.5 170.0 186.0
quantile(x, probs = 0.3)  # value at the 30% position
##   30% 
## 136.4
quantile(x, probs = c(0.25, 0.3))  # values at the 25% and 30% positions
##   25%   30% 
## 135.0 136.4
summary(x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   130.0   135.0   147.5   153.4   170.0   186.0

1.3.2 Boxplot boxplot()

# Box plot of x.
boxplot(x)

# One box per column for the first four columns of the built-in iris data.
boxplot(iris[, 1:4])

# Box plot of a single iris column.
boxplot(iris$Sepal.Width)

1.3.3 도수분포표

# Not referenced by the snippets visible below it.
score <- c(88, 67, 76, 80, 86, 94, 78, 84, 82, 75, 80, 75, 65, 84, 78, 82, 71, 60, 87, 75)

x <- c(6.5, 4.0, 7.1, 8.3, 5.4, 7.6, 9, 15.7, 16.7, 6.4, 5, 8.5, 5.7, 7.7, 7.2, 12.4, 7.1, 5.5, 9.7, 4.4, 7, 6.3, 8.3, 6.9, 5.7, 7.6, 7.9, 7.9, 6, 8.2, 10.4, 9.9, 3.9, 9.8, 8.2, 5.6, 7.9, 6.4, 7.4, 7, 13, 8.7, 6.4, 6.7, 7.4)

# Number of classes
x.num <- length(x)  # sample size
x.num^(1 / 3)  # cube root of the sample size
## [1] 3.556893
round(x.num^(1 / 3))  # rounded to a whole number of classes
## [1] 4
# Class width: the range split over 4 classes, rounded
round((max(x) - min(x)) / 4)
## [1] 3
# Assign each value to a left-closed, right-open class of width 3
x.cut <- cut(x, breaks = seq(3, 18, by = 3), right = FALSE)
x.cut
##  [1] [6,9)   [3,6)   [6,9)   [6,9)   [3,6)   [6,9)   [9,12)  [15,18)
##  [9] [15,18) [6,9)   [3,6)   [6,9)   [3,6)   [6,9)   [6,9)   [12,15)
## [17] [6,9)   [3,6)   [9,12)  [3,6)   [6,9)   [6,9)   [6,9)   [6,9)  
## [25] [3,6)   [6,9)   [6,9)   [6,9)   [6,9)   [6,9)   [9,12)  [9,12) 
## [33] [3,6)   [9,12)  [6,9)   [3,6)   [6,9)   [6,9)   [6,9)   [6,9)  
## [41] [12,15) [6,9)   [6,9)   [6,9)   [6,9)  
## Levels: [3,6) [6,9) [9,12) [12,15) [15,18)
  • 도수분포표 작성 자동화
# Build the frequency classes for x automatically instead of typing the
# class count, width and bounds by hand.
x.num <- length(x)
x.classNum <- round(x.num^(1 / 3))  # number of classes: cube root of n, rounded
# Class width: the range divided by the class count, then rounded. The
# division has to happen inside round(); rounding the range first and dividing
# afterwards produced a fractional width.
x.classInterval <- round((max(x) - min(x)) / x.classNum)
x.floor <- floor(min(x))  # lower bound: the minimum rounded down

# The breaks start at x.floor and advance by x.classInterval, so the upper
# bound must sit a whole number of class widths above x.floor. `rest` is how
# far the rounded-up maximum overshoots the last full class; `supplement` is
# the padding needed to reach the next break.
rest <- (ceiling(max(x)) - x.floor) %% x.classInterval
supplement <- x.classInterval - rest

x.ceiling <- if (rest == 0) ceiling(max(x)) else ceiling(max(x)) + supplement
x.ceiling
## [1] 18
# right= and labels= are arguments of cut(), not seq(); they belong outside
# the seq() call. right = FALSE makes the classes left-closed, right-open.
# NOTE(review): with right-open classes a maximum that falls exactly on the
# top break would be left unclassified (NA); not the case for this data.
x.cut <- cut(x, breaks = seq(x.floor, x.ceiling, by = x.classInterval), right = FALSE)
# Relabel the classes with "~" in place of "," (e.g. "[3~6)").
x.levels <- gsub(",", "~", levels(x.cut))
x.cut <- cut(x, breaks = seq(x.floor, x.ceiling, by = x.classInterval), right = FALSE, labels = x.levels)
table(x.cut)
## x.cut
##   [3~6)   [6~9)  [9~12) [12~15) [15~18) 
##       9      27       5       2       2
  • 도수분포표 패키지
library(devtools)
install_github("SukjaeChoi/easyStats")
## Downloading GitHub repo SukjaeChoi/easyStats@master
## from URL https://api.github.com/repos/SukjaeChoi/easyStats/zipball/master
## Installing easyStats
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore --quiet CMD INSTALL  \
##   '/private/var/folders/n4/wwktyc9j13g56dk5_jlxg4540000gp/T/RtmpIQUtWl/devtools420d70dafd2c/SukjaeChoi-easyStats-e9d98cd'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.5/Resources/library'  \
##   --install-tests
## 
library(easyStats)
freqTable(x)
## x.cut
##   [3~6)   [6~9)  [9~12) [12~15) [15~18) 
##       9      27       5       2       2

1.3.4 상대도수분포표

# Relative frequencies (percent) from the classes built above.
x.table <- table(x.cut)
round(prop.table(x.table), digits = 3) * 100
## x.cut
##   [3~6)   [6~9)  [9~12) [12~15) [15~18) 
##    20.0    60.0    11.1     4.4     4.4
# Relative frequencies (percent) from the package's frequency table.
x.table <- freqTable(x)
round(prop.table(x.table), digits = 3) * 100
## x.cut
##   [3~6)   [6~9)  [9~12) [12~15) [15~18) 
##    20.0    60.0    11.1     4.4     4.4

1.4 분포의 대칭성

1.4.1 왜도(skewness)

  • 왜도가 0보다 크면 오른쪽으로 비대칭 꼬리, 0보다 작으면 왼쪽으로 비대칭 꼬리
# skewness() comes from the moments package; stem() prints a text
# stem-and-leaf display of the same data.
library(moments)
skewness(x = x)
## [1] 1.567188
stem(x = x)
## 
##   The decimal point is at the |
## 
##    2 | 9
##    4 | 04045677
##    6 | 034445790011244667999
##    8 | 2233570789
##   10 | 4
##   12 | 40
##   14 | 7
##   16 | 7

1.4.2 첨도(kurtosis)

  • 첨도가 3이면 정규분포, 3보다 크면 뾰족, 3보다 작으면 완만한 형태
# Kurtosis of x.
kurtosis(x = x)
## [1] 5.988724
  • 정규곡선과 비교
# Kernel density of x with a normal curve (same mean and sd) drawn over it.
plot(density(x))
# The parameters are computed from the data before the function is defined.
# den.norm's argument is also named x, so mean(x) / sd(x) inside the function
# would summarise the evaluation points that curve() passes in, not the sample.
x.mean <- mean(x)
x.sd <- sd(x)
den.norm <- function(x) dnorm(x, mean = x.mean, sd = x.sd)
curve(den.norm, col = "red", add = TRUE, lty = 2)
abline(v = x.mean, col = "blue", lty = 2)  # vertical line at the sample mean

  • 타이타닉 예제
# Kernel density of Fare with a normal curve (same mean and sd) drawn over it.
plot(density(Fare, na.rm = TRUE))
# The parameters are computed from the data before the function is defined.
# den.norm's argument is also named Fare, so mean(Fare) / sd(Fare) inside the
# function would summarise the evaluation points that curve() passes in, not
# the column.
fare.mean <- mean(Fare, na.rm = TRUE)
fare.sd <- sd(Fare, na.rm = TRUE)
den.norm <- function(Fare) dnorm(Fare, mean = fare.mean, sd = fare.sd)
curve(den.norm, col = "red", add = TRUE, lty = 2)
# Reuse the NA-safe mean so the marker matches the curve's centre.
abline(v = fare.mean, col = "blue", lty = 2)

# Kernel density of the non-missing ages with a normal curve drawn over it.
age <- Age[!is.na(Age)]
plot(density(age))
# The parameters are computed from the data before the function is defined.
# den.norm's argument is also named age, so mean(age) / sd(age) inside the
# function would summarise the evaluation points that curve() passes in, not
# the sample.
age.mean <- mean(age)
age.sd <- sd(age)
den.norm <- function(age) dnorm(age, mean = age.mean, sd = age.sd)
curve(den.norm, col = "red", add = TRUE, lty = 2)
abline(v = age.mean, col = "blue", lty = 2)  # vertical line at the sample mean