# Attach dplyr (it supplies %>% and filter(), used further down) and load the
# airquality data set into the workspace.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(airquality)
# Per-column summary: quartiles, mean, and the number of NA values
# (only Ozone and Solar.R report any NA's).
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Lower-case every column name so the columns can be referred to as
# ozone, solar.r, wind, temp, month and day from here on.
colnames(airquality) <- tolower(colnames(airquality))
colnames(airquality)
## [1] "ozone" "solar.r" "wind" "temp" "month" "day"
# Element-wise missing-value flags for the ozone column (TRUE = NA).
is.na(airquality[["ozone"]])
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# Count missing vs. non-missing cells across the whole data set.
table(is.na(airquality))
##
## FALSE TRUE
## 874 44
# Same count restricted to the ozone column.
table(is.na(airquality[["ozone"]]))
##
## FALSE TRUE
## 116 37
# Per-column breakdown of missing vs. non-missing cells.
summary(is.na(airquality))
## ozone solar.r wind temp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:116 FALSE:146 FALSE:153 FALSE:153
## TRUE :37 TRUE :7
## month day
## Mode :logical Mode :logical
## FALSE:153 FALSE:153
##
# Without na.rm the NA values propagate, so both aggregates come back as NA.
sum(airquality[["ozone"]])
## [1] NA
mean(airquality[["ozone"]])
## [1] NA
# With na.rm = TRUE the NA values are dropped before aggregating.
sum(airquality[["ozone"]], na.rm = TRUE)
## [1] 4887
mean(airquality[["ozone"]], na.rm = TRUE)
## [1] 42.12931
# Drop every row that has an NA in any column, then confirm that no missing
# cells are left in the data set.
airquality <- stats::na.omit(airquality)
table(is.na(airquality))
##
## FALSE
## 666
# First three rows whose ozone value is not missing.
head(filter(airquality, !is.na(ozone)), 3)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
# First six rows (head()'s default) where neither ozone nor solar.r is missing.
head(filter(airquality, !(is.na(ozone) | is.na(solar.r))))
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 23 299 8.6 65 5 7
## 6 19 99 13.8 59 5 8
# Mean of the observed (non-missing) ozone values.
mean(airquality$ozone, na.rm = TRUE)
## [1] 42.0991
# Mean imputation: replace any missing ozone value with the mean of the
# observed values. The mean is computed from the data instead of being pasted
# in as the rounded constant 42.0991, so the step stays exact and keeps
# working if the data change.
# NOTE: the na.omit() call earlier in this script already removed every row
# containing an NA, so nothing is actually replaced at this point.
airquality$ozone <- ifelse(
  is.na(airquality$ozone),
  mean(airquality$ozone, na.rm = TRUE),
  airquality$ozone
)
# Verify that no missing ozone values remain.
table(is.na(airquality$ozone))
##
## FALSE
## 111
# Small example data frame: six rows with a gender code (stored as character)
# and an income value.
# NOTE(review): gender "3" is presumably the out-of-range code this example is
# meant to expose -- confirm against the accompanying material.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
print(ott7)
## gender income
## 1 1 200
## 2 1 250
## 3 2 200
## 4 2 300
## 5 2 200
## 6 3 150
# Check for outliers with a box plot.
# boxplot() draws the plot and also returns its statistics; the $stats column
# lists, top to bottom: lower whisker end, Q1, median, Q3, upper whisker end.
boxplot(iris[["Sepal.Width"]])$stats

## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
# IQR = Q3 - Q1
# Lower whisker limit: Q1 - 1.5 x IQR
# Upper whisker limit: Q3 + 1.5 x IQR
# Here IQR = 3.3 - 2.8 = 0.5, giving limits of 2.05 and 4.05; the whisker ends
# reported above (2.2 and 4.0) lie just inside those limits.