# 결측치와 이상치 처리
# 01 결측치
# 데이터에서 값이 비어 있는 상태를 의미
# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# deletes every object in the caller's global environment when this script is
# sourced, yet it neither detaches packages nor resets options, so it does not
# produce a clean session anyway. Restart R to get a fresh session instead.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the built-in `airquality` data set and print per-column summary
# statistics; summary() also reports how many NA values each column holds.
data("airquality")
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# R에서는 NA가 결측값을 의미합니다.
# Show the compact structure of the data frame: column types and the first
# few values of each column.
str(object = airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# is.na()는 결측값인 위치를 TRUE로, 그렇지 않은 위치를 FALSE로 반환합니다.
# Count missing (TRUE) versus present (FALSE) cells across the whole data
# frame. `useNA = "no"` is table()'s default, spelled out for clarity.
table(is.na(airquality), useNA = "no")
##
## FALSE TRUE
## 874 44
# 결측값이 하나라도 있으면 sum(), mean() 등의 연산 결과가 NA가 됩니다.
# sum(airquality$Ozone)
# mean(airquality$Ozone)
# Total of the Ozone column, skipping missing values.
sum(airquality[["Ozone"]], na.rm = TRUE)
## [1] 4887
# Average of the Ozone column, skipping missing values.
mean(airquality[["Ozone"]], na.rm = TRUE)
## [1] 42.12931
# na.rm = TRUE: 결측값을 제외하고 계산합니다.
# names(airquality) <-tolower(names(airquality))
# is.na(airquality$Ozone)
# table(is.na(airquality))
# table(is.na(airquality$Ozone))
# summary(is.na(airquality))
# sum(airquality$Ozone)
# mean(airquality$Ozone)
# sum(airquality$Ozone,na.rm=TRUE)
# mean(airquality$Ozone,na.rm=TRUE)
# 상자그림(boxplot)은 데이터의 분포와 이상치를 한눈에
# 보여주는 시각화 도구입니다
# 중앙값, 사분위수, 범위, 이상치 등을 확인하는 데 널리 사용됩니다.
# Load the built-in `iris` data set and print summary statistics for every
# column (numeric summaries for measurements, counts for the Species factor).
data("iris")
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Box plot of petal length for each species. The formula names the columns
# directly and `data = iris` supplies them, so any axis labels derived from
# the formula read `Species` / `Petal.Length` rather than `iris$Species` /
# `iris$Petal.Length`, and `data` is no longer a redundant argument.
boxplot(Petal.Length ~ Species, data = iris)

# Load the built-in `ChickWeight` data set and print summary statistics for
# each of its columns.
data(list = "ChickWeight")
summary(object = ChickWeight)
## weight Time Chick Diet
## Min. : 35.0 Min. : 0.00 13 : 12 1:220
## 1st Qu.: 63.0 1st Qu.: 4.00 9 : 12 2:120
## Median :103.0 Median :10.00 20 : 12 3:120
## Mean :121.8 Mean :10.72 10 : 12 4:118
## 3rd Qu.:163.8 3rd Qu.:16.00 17 : 12
## Max. :373.0 Max. :21.00 19 : 12
## (Other):506
# Box plot of chick weight for each diet. The formula uses bare column names
# resolved through `data = ChickWeight`, so labels derived from the formula
# read `Diet` / `weight` instead of carrying the `ChickWeight$` prefix, and
# `data` is no longer a redundant argument.
boxplot(weight ~ Diet, data = ChickWeight)

# Load the built-in `chickwts` data set and print summary statistics for its
# `weight` and `feed` columns.
data(list = "chickwts")
summary(object = chickwts)
## weight feed
## Min. :108.0 casein :12
## 1st Qu.:204.5 horsebean:10
## Median :258.0 linseed :12
## Mean :261.3 meatmeal :11
## 3rd Qu.:323.5 soybean :14
## Max. :423.0 sunflower:12
# Box plot of chick weight for each feed type. The formula uses bare column
# names resolved through `data = chickwts`, so labels derived from the
# formula read `feed` / `weight` instead of carrying the `chickwts$` prefix,
# and `data` is no longer a redundant argument.
boxplot(weight ~ feed, data = chickwts)

# Histogram of the ChickWeight weight column, to inspect the shape of its
# distribution.
hist(x = ChickWeight$weight)

# ChickWeight의 weight 변수는 평균(mean) > 중앙값(median)이므로 오른쪽으로 꼬리가 긴(right-skewed) 분포이다.
# 상자그림으로는 이상값을 탐색할 수 있다.
# 상자그림만으로는 전체 관측치 수를 파악할 수 없다.
# 상자그림만으로는 결측치의 수도 확인할 수 없다(상자그림은 결측치를 제외하고 그린다).
# Kernel density estimate of the ChickWeight weight column; at top level the
# result is auto-printed as a summary of the evaluation grid and the density.
density(x = ChickWeight$weight)
##
## Call:
## density.default(x = ChickWeight$weight)
##
## Data: ChickWeight$weight (578 obs.); Bandwidth 'bw' = 17.93
##
## x y
## Min. :-18.79 Min. :4.765e-07
## 1st Qu.: 92.61 1st Qu.:2.908e-04
## Median :204.00 Median :1.326e-03
## Mean :204.00 Mean :2.240e-03
## 3rd Qu.:315.39 3rd Qu.:3.976e-03
## Max. :426.79 Max. :7.323e-03
# Small hand-built data frame: `gender` holds character codes and `income`
# holds numeric values, one row per observation.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
# Print the full data frame.
ott7
## gender income
## 1 1 200
## 2 1 250
## 3 2 200
## 4 2 300
## 5 2 200
## 6 3 150
# Frequency of each gender code.
table(ott7[["gender"]])
##
## 1 2 3
## 2 3 1