Ch2: descriptive statistics

0.1 Prerequisite

Read the file Rstudio_02 on eTL.

0.2 Summary Statistics for Describing Data

## data input

# Gender of the 18 observations, typed out one value at a time
# (8 "M" followed by 10 "F") and converted to a factor.
gender <- factor(c(
  "M", "M", "M", "M", "M", "M", "M", "M",
  "F", "F", "F", "F", "F", "F", "F", "F", "F", "F"
))
class(gender)

## [1] "factor"

# The same factor built compactly: "M" repeated 8 times, then "F" 10 times.
gender <- factor(rep(c("M", "F"), times = c(8, 10)))

# Scores, one per observation, in the same order as `gender`.
score <- c(
  98, 90, 96, 54, 43, 87, 88, 90,
  94, 92, 81, 79, 85, 91, 79, 88, 89, 83
)

# Combine both vectors into a data frame with columns `gender` and `score`.
df <- data.frame(gender = gender, score = score)

## histogram
# Default histogram of the scores: bar height is the count in each bin.
hist(score)

# ?hist
hist(score, freq = FALSE)  # density scale instead of counts (total bar area is 1)

# A single number for `breaks` is only a suggested number of cells: hist()
# rounds to "pretty" breakpoints, so the plot may not show exactly 10 bins.
hist(score, breaks = 10)

## plots
# Text-based stem-and-leaf display. With the default scale each printed stem
# below covers two tens (e.g. "4 | 34" holds the scores 43 and 54).
stem(score)  # Stem-and-Leaf Plots

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   4 | 34
##   6 | 99
##   8 | 13578890012468

# scale = 2 stretches the display; here every stem covers a single ten.
stem(score, scale = 2)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   4 | 3
##   5 | 4
##   6 | 
##   7 | 99
##   8 | 1357889
##   9 | 0012468

# Box-and-whisker plot of the scores.
boxplot(score)

## summary
table(gender)  # counts per factor level (like a pivot table in Excel)

## gender
##  F  M 
## 10  8

# Positional indexing: the first cell is the count for level "F".
table(gender)[1]

##  F 
## 10

# Tukey's five-number summary: minimum, lower hinge (~Q1), median,
# upper hinge (~Q3), maximum. Hinges can differ slightly from the quartiles
# reported by summary()/quantile() (here 81 vs 81.50 and 91 vs 90.75).
fivenum(score)

## [1] 43 81 88 91 98

summary(score)  # frequently used: min, quartiles, median, mean and max

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   43.00   81.50   88.00   83.72   90.75   98.00

# Load the built-in iris data set and show its structure
# (number of rows, column types and the first values of each column).
data(iris)
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# First rows of the data frame.
head(iris)

# Column-wise summary: six-number summary for numeric columns,
# level counts for the factor column.
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Third quartile (75th percentile) of the scores.
quantile(score, probs = 0.75)

##   75% 
## 90.75

# Several quantiles at once: minimum, quartiles and maximum.
quantile(score, probs = c(0, 0.25, 0.5, 0.75, 1))

##    0%   25%   50%   75%  100% 
## 43.00 81.50 88.00 90.75 98.00

# Interquartile range: 75% quantile minus 25% quantile (90.75 - 81.50).
IQR(score)

## [1] 9.25

# Sample variance.
var(score)

## [1] 195.9771

# Sample standard deviation (square root of the variance above).
sd(score)

## [1] 13.99918

# Smallest score.
min(score)

## [1] 43

# Largest score.
max(score)

## [1] 98

# Middle value of the sorted scores.
median(score)

## [1] 88

## covariance and correlation
# Two paired vectors of 10 scores each (math and physics).
math <- c(66, 64, 48, 46, 78, 60, 90, 50, 66, 70)
phy <- c(70, 68, 46, 48, 84, 64, 92, 52, 68, 72)

cov(math, phy)  # sample covariance (same value as var(math, phy))

## [1] 202.7556

# Sample correlation from its definition:
# covariance divided by the square root of the product of the variances.
cov(math, phy) / sqrt(var(math) * var(phy))

## [1] 0.9918056

cor(math, phy)  # built-in correlation; gives the same value as the line above

## [1] 0.9918056

0.2.1 Bivariate Data

# scatter plot
# Default scatter plot of physics against math scores.
plot(math, phy)

# Same data with an explicit title, subtitle, axis labels and axis ranges.
plot(
  math, phy,
  type = "p",                             # display type (p = points)
  main = "Math score and Physics score",  # main title
  sub = "(total : 10)",                   # subtitle
  xlab = "Math",                          # x-axis label
  ylab = "Physics",                       # y-axis label
  xlim = c(0, 100),                       # range of x values shown
  ylim = c(0, 100)                        # range of y values shown
)

plot(1:10, 1:10, col = "red", type = "b")  # b = both (lines and points)

# col = 2 selects entry 2 of palette(), which is a red; the exact shade may
# differ from "red" depending on the R version's default palette.
plot(1:10, 1:10, col = 2, type = "b")

For other plotting colors, see this (running `colors()` in R also lists every named color).

For graphical parameters, see:
?plot.default
?par

0.3 Other Topics: Summary and Visualization (Not for Exam)

How can you learn useful packages by yourself?
Try searching Google for ‘ggplot2 in cran’!

Comparing ggplot2 and R Base Graphics

Reproduce a Plot in The Elements of Statistical Learning

Sankey Diagram

< Charles Minard’s 1869 chart showing the number of men in Napoleon’s 1812 Russian campaign army, Source: Wikipedia >

## Install packages
# install.packages("dplyr")
# install.packages("Hmisc")
# call packages
library(dplyr)
library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.4.1

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## Prefer 'packagename::function()' for readability: it makes the source
## package explicit, which matters here because Hmisc masks some dplyr
## functions (see the attach messages above).
dplyr::glimpse(iris)

## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...

# Per-variable report: n, missing and distinct counts, mean and selected
# quantiles for numeric columns, and a frequency table for the factor column.
Hmisc::describe(iris)

## iris 
## 
##  5  Variables      150  Observations
## ---------------------------------------------------------------------------
## Sepal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       35    0.998    5.843   0.9462    4.600    4.800 
##      .25      .50      .75      .90      .95 
##    5.100    5.800    6.400    6.900    7.255 
## 
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## ---------------------------------------------------------------------------
## Sepal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       23    0.992    3.057   0.4872    2.345    2.500 
##      .25      .50      .75      .90      .95 
##    2.800    3.000    3.300    3.610    3.800 
## 
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## ---------------------------------------------------------------------------
## Petal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       43    0.998    3.758    1.979     1.30     1.40 
##      .25      .50      .75      .90      .95 
##     1.60     4.35     5.10     5.80     6.10 
## 
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## ---------------------------------------------------------------------------
## Petal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       22     0.99    1.199   0.8676      0.2      0.2 
##      .25      .50      .75      .90      .95 
##      0.3      1.3      1.8      2.2      2.3 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## ---------------------------------------------------------------------------
## Species 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333
## ---------------------------------------------------------------------------

Useful Contents

(Book) Ggplot2: Elegant Graphics for Data Analysis

0.4 Quiz2 (See ‘Rstudio_02 내 2.4.예제’)

due on 2018/3/20 11:59PM
[자료를 활용한 예제]
행동위험요인 감시시스템(The Behavioral Risk Factor Surveillance System)은 매년 미국에서 시행되는 대규모 전화 설문 조사이다. 이 조사에서는 응답자들의 현재 건강 상태 및 그들의 건강과 관련된 생활 습관 등을 조사한다.
이 조사에 관한 자세한 내용은 BRFSS 의 웹사이트에서 확인할 수 있다. 주어진 자료는 2000년도에 시행된 20,000명의 BRFSS 조사 데이터의 일부이며 전체 200개 이상의 항목 중에서 간추린 9개의 항목을 포함하고 있다. 각 변수에 대한 설명은 다음과 같다.

genhlth : 범주형 자료, 전반적인 건강상태 (excellent / very good / good / fair / poor)
exerany : 범주형 자료, 지난달의 운동 여부 (1 = yes, 0 = no)
hlthplan : 범주형 자료, 건강보험 가입 여부 (1 = yes, 0 = no)
smoke100 : 범주형 자료, 현재까지 최소 100개비 이상의 담배 흡연 여부 (1 = yes, 0 = no)
height : 숫자형 자료, 신장 (inch)
weight : 숫자형 자료, 체중 (pound)
wtdesire : 숫자형 자료, 응답자가 생각하는 본인의 이상적인 체중 (pound)
age : 숫자형 자료, 나이 (year)
gender : 범주형 자료, 성별 (m = 남성, f = 여성)

[Step 1: 파일(cdc.txt) 불러오기]

# read.csv("filepath", header = TRUE, sep = ',')
# read.table("filepath", header = TRUE)

# Read the CDC sample; header = TRUE takes the column names from the first line.
# file.path() builds the relative path portably (the previous hard-coded "\\"
# separators only resolve on Windows).
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 too, so the
# result matches the str() output recorded further down (genhlth and gender
# shown as Factor).
cdc <- read.table(file.path("DAT", "ch02", "cdc.txt"),
                  header = TRUE, stringsAsFactors = TRUE)
cdc  # prints the data frame; use head(cdc) for a short preview

# read.table("clipboard", header = TRUE)        # load data copied with [Ctrl+C] from the clipboard

[Step 2: Dataframe 다루기]

# Confirm that the imported object is a data frame.
class(cdc)

## [1] "data.frame"

# Compact overview: dimensions plus the type and first values of every column.
str(cdc)

## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","fair",..: 3 3 3 3 5 5 5 5 3 3 ...
##  $ exerany : int  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: int  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : int  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "f","m": 2 1 1 1 1 1 2 2 1 2 ...

# First rows of the data frame.
head(cdc)

# Last rows of the data frame.
tail(cdc)

# First values of a single column, extracted with `$`.
head(cdc$genhlth)

## [1] good      good      good      good      very good very good
## Levels: excellent fair good poor very good

[예제 1]. genhlth 변수에 대해 적절한 방법을 이용하여 요약해보자. 범주형 자료의 경우에는 어떠한 요약 방법을 사용할 수 있는가?(table())

[예제 2]. weight 변수에 대한 수치적 요약 값을 구해보자. 전체 응답자의 평균 몸무게는 얼마인가? (mean(), summary(), IQR(), quantile(), var(), …)

[예제 3]. weight 변수와 wtdesire 변수의 산점도를 그려보자. 두 변수 사이에는 어떠한 관계가 존재한다고 보여지는가? 두 변수의 상관계수는 무엇을 나타내고 있는가? (plot( , main = ""), cor())

[예제 4]. wtdesire 변수와 weight 변수의 차를 계산하여 새로운 변수 wdiff 를 만들어보자. wdiff 의 분포는 어떠한가? 수치적 요약과 그래프 요약을 통해 살펴보자. 이것이 의미하는 바는 무엇인가? (summary(), boxplot( , main = ""))

[예제 5]. age 변수를 이용하여 히스토그램을 그려보자. 그리고 구간의 수를 50, 100으로 바꿔 가며 동일한 히스토그램을 그린 후 비교해보자.(hist( , main = ""))
(참고) 히스토그램은 자료의 형태를 파악하기 위한 쉬운 방법이지만 구간의 수가 달라짐에 따라 그 모양이 조금씩 달라질 수 있다.