#3.4 Annotations
# Setup for section 3.4: attach ggplot2 and load its `economics` data set.
library(ggplot2)
data("economics")
# Deliberately no `rm(list = ls())` here: it would delete the `economics` copy
# loaded on the line above and silently destroy every other object in the
# session of whoever runs this script. Start from a fresh R session instead.
ls() # list the names of the objects currently in the workspace
## [1] "economics"   (when run in a fresh session)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Inspect the columns of `economics`, then draw the number of unemployed
# (`unemploy`) against the observation date as a line.
glimpse(x = economics)
## Rows: 574
## Columns: 6
## $ date <date> 1967-07-01, 1967-08-01, 1967-09-01, 1967-10-01, 1967-11-01, …
## $ pce <dbl> 506.7, 509.8, 515.6, 512.2, 517.4, 525.1, 530.9, 533.6, 544.3…
## $ pop <dbl> 198712, 198911, 199113, 199311, 199498, 199657, 199808, 19992…
## $ psavert <dbl> 12.6, 12.6, 11.9, 12.9, 12.8, 11.8, 11.7, 12.3, 11.7, 12.3, 1…
## $ uempmed <dbl> 4.5, 4.7, 4.6, 4.9, 4.7, 4.8, 5.1, 4.5, 4.1, 4.6, 4.4, 4.4, 4…
## $ unemploy <dbl> 2944, 2945, 2958, 3143, 3066, 3018, 2878, 3001, 2877, 2709, 2…
ggplot(data = economics, mapping = aes(x = date, y = unemploy)) +
  geom_line()

# Load the presidential terms and keep only the terms that started after the
# first observation in `economics`.
data(list = "presidential")
glimpse(x = presidential)
## Rows: 11
## Columns: 4
## $ name <chr> "Eisenhower", "Kennedy", "Johnson", "Nixon", "Ford", "Carter", "…
## $ start <date> 1953-01-20, 1961-01-20, 1963-11-22, 1969-01-20, 1974-08-09, 197…
## $ end <date> 1961-01-20, 1963-11-22, 1969-01-20, 1974-08-09, 1977-01-20, 198…
## $ party <chr> "Republican", "Democratic", "Democratic", "Republican", "Republi…
head(x = presidential)
head(x = economics)
economics[["date"]][1]
## [1] "1967-07-01"
# subset() is the base-R function that does the same job as dplyr's filter().
presidential <- subset(presidential, subset = start > economics$date[1])
head(x = presidential)
# Same condition again, this time with dplyr; `presidential` was already
# subsetted above, so this keeps every remaining row.
presidential1 <- dplyr::filter(presidential, start > economics$date[1])
print(presidential1)
# Unemployment over time, annotated with the presidential terms held in
# `presidential1`. Layers are drawn in the order they are added, so the
# unemployment line ends up on top of the annotations.
ggplot(data = economics) +
  # One shaded band per term, filled by party and spanning the whole y-range.
  geom_rect(
    mapping = aes(xmin = start, xmax = end, fill = party),
    data = presidential1,
    ymin = -Inf,
    ymax = Inf,
    alpha = 0.7
  ) +
  # Vertical rule at the start of each term.
  geom_vline(
    mapping = aes(xintercept = as.numeric(start)),
    data = presidential1,
    colour = "grey50",
    alpha = 0.5
  ) +
  # President's name, anchored at y = 2500 and nudged right of the rule.
  geom_text(
    mapping = aes(x = start, y = 2500, label = name),
    data = presidential1,
    size = 3,
    vjust = 0,
    hjust = 0,
    nudge_x = 50
  ) +
  geom_line(mapping = aes(x = date, y = unemploy)) +
  # NOTE(review): the colours are unnamed, so they are matched to the `party`
  # values by position -- presumably Democratic = blue, Republican = red; confirm.
  scale_fill_manual(values = c("blue", "red"))

# Binned log10(price) vs. log10(carat), one panel per cut.
ggplot(data = diamonds, mapping = aes(x = log10(carat), y = log10(price))) +
  geom_bin2d() +
  facet_wrap(facets = ~cut, nrow = 1)

# Intercept and slope of the log-log linear fit over all diamonds.
mod_coef <- coef(lm(formula = log10(price) ~ log10(carat), data = diamonds))
print(mod_coef)
## (Intercept) log10(carat)
## 3.669207 1.675817
# Same panels, with the overall fitted line repeated in every facet.
# NOTE(review): newer ggplot2 releases deprecate `size` for lines in favour of
# `linewidth`; kept as-is so the script behaves the same on the version it was
# written for -- confirm the installed version before switching.
ggplot(data = diamonds, mapping = aes(x = log10(carat), y = log10(price))) +
  geom_bin2d() +
  geom_abline(
    slope = mod_coef[2],
    intercept = mod_coef[1],
    size = 1,
    colour = "white"
  ) +
  facet_wrap(facets = ~cut, nrow = 1)

# Grouping examples using the `Oxboys` data set from the nlme package.
data(list = "Oxboys", package = "nlme")
head(x = Oxboys)
glimpse(x = Oxboys)
## Rows: 234
## Columns: 4
## $ Subject <ord> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3…
## $ age <dbl> -1.0000, -0.7479, -0.4630, -0.1643, -0.0027, 0.2466, 0.5562, …
## $ height <dbl> 140.5, 143.4, 144.8, 147.1, 147.7, 150.2, 151.7, 153.3, 155.8…
## $ Occasion <ord> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3…
# Correct result: the group aesthetic is set to Subject.
ggplot(data = Oxboys, mapping = aes(x = age, y = height, group = Subject)) +
  geom_point() +
  geom_line()

# Wrong on purpose: no group aesthetic, so this is the result we do NOT want.
ggplot(data = Oxboys, mapping = aes(x = age, y = height)) +
  geom_point() +
  geom_line()

# Correct result: the grouping is set in ggplot(), so the smooth layer
# inherits it together with the line layer.
ggplot(data = Oxboys, mapping = aes(x = age, y = height, group = Subject)) +
  geom_line() +
  geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Correct result: the grouping is set on the line layer only, so the smooth
# layer is fitted without it.
ggplot(data = Oxboys, mapping = aes(x = age, y = height)) +
  geom_line(mapping = aes(group = Subject)) +
  geom_smooth(se = FALSE, size = 2, method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Load `airquality`, summarise it, lower-case its column names and flag the
# missing ozone readings.
data(list = "airquality")
library(dplyr)
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Lower-case every column name so later code can refer to `ozone`, `solar.r`, ...
colnames(airquality) <- tolower(colnames(airquality))
colnames(airquality)
## [1] "ozone" "solar.r" "wind" "temp" "month" "day"
# Logical vector: TRUE where the ozone reading is missing.
is.na(airquality[["ozone"]])
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# Missing-value handling for `airquality` (column names already lower-cased).
table(is.na(airquality)) # frequency of missing cells across the whole data set
##
## FALSE TRUE
## 874 44
table(is.na(airquality$ozone))
##
## FALSE TRUE
## 116 37
summary(is.na(airquality))
## ozone solar.r wind temp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:116 FALSE:146 FALSE:153 FALSE:153
## TRUE :37 TRUE :7
## month day
## Mode :logical Mode :logical
## FALSE:153 FALSE:153
##
# Aggregates return NA as soon as one value is missing ...
sum(airquality$ozone)
## [1] NA
mean(airquality$ozone)
## [1] NA
# ... unless the missing values are dropped explicitly.
sum(airquality$ozone, na.rm = TRUE)
## [1] 4887
mean(airquality$ozone, na.rm = TRUE)
## [1] 42.12931
# Drop every row that has a missing value in any column.
airquality <- na.omit(airquality)
table(is.na(airquality))
##
## FALSE
## 666
# Equivalent dplyr filters; no rows are removed here because na.omit() above
# already dropped all incomplete rows.
airquality %>% filter(!is.na(ozone)) %>% head(3)
airquality %>% filter(!is.na(ozone) & !is.na(solar.r)) %>% head()
mean(airquality$ozone, na.rm = TRUE)
## [1] 42.0991
# Mean imputation. The fill value is computed from the data instead of being
# typed in as the rounded constant 42.0991, so it stays exact and cannot go
# stale if the rows change. After na.omit() there is nothing left to replace,
# so this step currently leaves `ozone` unchanged.
airquality$ozone <- ifelse(
  is.na(airquality$ozone),
  mean(airquality$ozone, na.rm = TRUE),
  airquality$ozone
)
table(is.na(airquality$ozone))
##
## FALSE
## 111
# Small example data frame: a gender code and an income per respondent.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
print(ott7)
# Use a box plot to check for outliers; `stats` holds the five summary values
# the box plot is drawn from.
boxplot(x = iris$Sepal.Width)[["stats"]]

## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
# 8장 실전문제
# Read the air-quality workbook; the relative path is resolved against the
# working directory printed here.
getwd()
## [1] "C:/data"
library(dplyr)
library(readxl)
airseoul <- read_excel(path = "period1.xlsx")
str(object = airseoul)
## tibble [1,535 × 8] (S3: tbl_df/tbl/data.frame)
## $ 날짜 : chr [1:1535] "전체" "2022-03-31" "2022-03-31" "2022-03-31" ...
## $ 측정소명 : chr [1:1535] "평균" "평균" "강남구" "강동구" ...
## $ 미세먼지 PM10
## (㎍/m3) : num [1:1535] 41 20 21 26 NA 23 19 21 23 17 ...
## $ 초미세먼지
## PM2.5 (㎍/m3): num [1:1535] 23 11 10 13 17 9 9 10 10 9 ...
## $ 오존
## O3 (ppm) : chr [1:1535] "0.026" "0.032" "0.033" "0.026" ...
## $ 이산화질소
## NO2 (ppm) : chr [1:1535] "0.026" "0.014" "0.015" "0.014" ...
## $ 일산화탄소
## CO (ppm) : chr [1:1535] "0.5" "0.3" "0.3" "0.3" ...
## $ 아황산가스
## SO2(ppm) : num [1:1535] 0.003 0.002 0.003 0.002 0.003 0.003 0.002 0.003 0.003 0.002 ...
# Inspect the raw columns, then keep only date, station and the two
# particulate-matter columns under short ASCII names.
glimpse(x = airseoul)
## Rows: 1,535
## Columns: 8
## $ 날짜 <chr> "전체", "2022-03-31", "2022-03-31", "202…
## $ 측정소명 <chr> "평균", "평균", "강남구", "강동구", "강…
## $ `미세먼지 PM10\r\n(㎍/m3)` <dbl> 41, 20, 21, 26, NA, 23, 19, 21, 23, 17, …
## $ `초미세먼지\r\nPM2.5 (㎍/m3)` <dbl> 23, 11, 10, 13, 17, 9, 9, 10, 10, 9, 11,…
## $ `오존\r\nO3 (ppm)` <chr> "0.026", "0.032", "0.033", "0.026", "0.0…
## $ `이산화질소\r\nNO2 (ppm)` <chr> "0.026", "0.014", "0.015", "0.014", "0.0…
## $ `일산화탄소\r\nCO (ppm)` <chr> "0.5", "0.3", "0.3", "0.3", "0.3", "0.4"…
## $ `아황산가스\r\nSO2(ppm)` <dbl> 0.003, 0.002, 0.003, 0.002, 0.003, 0.003…
colnames(airseoul)
## [1] "날짜" "측정소명"
## [3] "미세먼지 PM10\r\n(㎍/m3)" "초미세먼지\r\nPM2.5 (㎍/m3)"
## [5] "오존\r\nO3 (ppm)" "이산화질소\r\nNO2 (ppm)"
## [7] "일산화탄소\r\nCO (ppm)" "아황산가스\r\nSO2(ppm)"
# select() renames while it selects, so one call both picks the four columns
# and gives them their new names (the raw headers contain "\r\n" line breaks).
airseoul1 <- airseoul %>%
  select(
    date = "날짜",
    region = "측정소명",
    pm10 = "미세먼지 PM10\r\n(㎍/m3)",
    pm2.5 = "초미세먼지\r\nPM2.5 (㎍/m3)"
  )
# Number of rows per date value.
table(airseoul1[["date"]])
##
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07
## 26 26 26 26 26 26 26
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14
## 26 26 26 26 26 26 26
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21
## 26 26 26 26 26 26 26
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## 26 26 26 26 26 26 26
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07
## 26 26 26 26 26 26 26
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14
## 26 26 26 26 26 26 26
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21
## 26 26 26 26 26 26 26
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28
## 26 26 26 26 26 26 26
## 2022-03-29 2022-03-30 2022-03-31 전체
## 26 26 26 1
# Visual check of the table; the viewer is only opened in interactive
# sessions so that a batch run of the script does not fail on it.
if (interactive()) {
  View(airseoul1)
}
# Drop the overall "전체" row and the per-date "평균" rows so that only the
# individual stations remain. The `&` between the two conditions had been
# mangled into `®` (an HTML-entity artifact of "&reg"), which made this line a
# syntax error.
airseoul1 <- airseoul1 %>%
  filter(date != "전체" & region != "평균")
table(airseoul1$date)
##
## 2022-02-01 2022-02-02 2022-02-03 2022-02-04 2022-02-05 2022-02-06 2022-02-07
## 25 25 25 25 25 25 25
## 2022-02-08 2022-02-09 2022-02-10 2022-02-11 2022-02-12 2022-02-13 2022-02-14
## 25 25 25 25 25 25 25
## 2022-02-15 2022-02-16 2022-02-17 2022-02-18 2022-02-19 2022-02-20 2022-02-21
## 25 25 25 25 25 25 25
## 2022-02-22 2022-02-23 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## 25 25 25 25 25 25 25
## 2022-03-01 2022-03-02 2022-03-03 2022-03-04 2022-03-05 2022-03-06 2022-03-07
## 25 25 25 25 25 25 25
## 2022-03-08 2022-03-09 2022-03-10 2022-03-11 2022-03-12 2022-03-13 2022-03-14
## 25 25 25 25 25 25 25
## 2022-03-15 2022-03-16 2022-03-17 2022-03-18 2022-03-19 2022-03-20 2022-03-21
## 25 25 25 25 25 25 25
## 2022-03-22 2022-03-23 2022-03-24 2022-03-25 2022-03-26 2022-03-27 2022-03-28
## 25 25 25 25 25 25 25
## 2022-03-29 2022-03-30 2022-03-31
## 25 25 25
# Rows per station, then the distribution of both particulate measures.
table(airseoul1[["region"]])
##
## 강남구 강동구 강북구 강서구 관악구 광진구 구로구 금천구
## 59 59 59 59 59 59 59 59
## 노원구 도봉구 동대문구 동작구 마포구 서대문구 서초구 성동구
## 59 59 59 59 59 59 59 59
## 성북구 송파구 양천구 영등포구 용산구 은평구 종로구 중구
## 59 59 59 59 59 59 59 59
## 중랑구
## 59
summary(airseoul1[["pm10"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.00 27.00 36.00 40.54 50.00 112.00 7
summary(airseoul1[["pm2.5"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 13.00 19.00 23.28 32.00 92.00 3
# Keep only rows where both measures are present (comma-separated conditions
# in filter() are combined with AND).
airseoul1 <- filter(airseoul1, !is.na(pm10), !is.na(pm2.5))
# Row(s) holding the highest PM10 reading.
airseoul1 %>%
  filter(pm10 == max(pm10)) %>%
  select(date, region, pm10)
# Five stations with the highest mean PM10.
airseoul1 %>%
  group_by(region) %>%
  summarise(m = mean(pm10)) %>%
  arrange(desc(m)) %>%
  head(n = 5)