# Midterm scores for five students, keyed by student id.
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
# Final-exam scores for the same five student ids.
test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)
test1
## id midterm
## 1 1 60
## 2 2 80
## 3 3 70
## 4 4 90
## 5 5 85
test2
## id final
## 1 1 70
## 2 2 83
## 3 3 65
## 4 4 95
## 5 5 80
# Combine the two score tables side by side, matching rows on the id column.
total <- test1 %>%
  left_join(test2, by = "id")
total
## id midterm final
## 1 1 60 70
## 2 2 80 83
## 3 3 70 65
## 4 4 90 95
## 5 5 85 80
# Lookup table pairing each class number (1-5) with a teacher name.
name <- data.frame(
  class = c(1, 2, 3, 4, 5),
  teacher = c("kim", "lee", "park", "choi", "jung")
)
name
## class teacher
## 1 1 kim
## 2 2 lee
## 3 3 park
## 4 4 choi
## 5 5 jung
# Load the exam scores from csv_exam.csv (path is relative to the working
# directory).
exam <- read.csv("csv_exam.csv")
# Add the teacher column to every exam row by matching on class.
exam_name <- exam %>%
  left_join(name, by = "class")
exam_name
## id class math english science teacher
## 1 1 1 50 98 50 kim
## 2 2 1 60 97 60 kim
## 3 3 1 45 86 78 kim
## 4 4 1 30 98 58 kim
## 5 5 2 25 80 65 lee
## 6 6 2 50 89 98 lee
## 7 7 2 80 90 45 lee
## 8 8 2 90 78 25 lee
## 9 9 3 20 98 15 park
## 10 10 3 50 98 45 park
## 11 11 3 65 65 65 park
## 12 12 3 45 85 32 park
## 13 13 4 46 98 65 choi
## 14 14 4 48 87 12 choi
## 15 15 4 75 56 78 choi
## 16 16 4 58 98 65 choi
## 17 17 5 65 68 98 jung
## 18 18 5 80 78 90 jung
## 19 19 5 89 68 87 jung
## 20 20 5 78 83 58 jung
# Test scores for group A, keyed by id.
group_a <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(60, 80, 70, 90, 85)
)
# Test scores for group B; same column names as group_a.
group_b <- data.frame(
  id = c(1, 2, 3, 4, 5),
  test = c(70, 83, 65, 95, 80)
)
group_a
## id test
## 1 1 60
## 2 2 80
## 3 3 70
## 4 4 90
## 5 5 85
group_b
## id test
## 1 1 70
## 2 2 83
## 3 3 65
## 4 4 95
## 5 5 80
bind_rows()를 이용해 데이터를 세로로 합친다. 변수명이 다르면 rename()으로 동일하게 맞춘 후 합친다.
# Stack the rows of group_b underneath the rows of group_a.
group_all <- group_a %>%
  bind_rows(group_b)
group_all
## id test
## 1 1 60
## 2 2 80
## 3 3 70
## 4 4 90
## 5 5 85
## 6 1 70
## 7 2 83
## 8 3 65
## 9 4 95
## 10 5 80
mpg 데이터의 fl 변수는 자동차에 사용하는 연료(fuel)를 의미합니다. 아래는 자동차 연료별 가격을 나타낸 표입니다.
fl | 연료 종류 | 가격 (갤런당 USD) |
---|---|---|
c | CNG | 2.35 |
d | diesel | 2.38 |
e | ethanol E85 | 2.11 |
p | premium | 2.76 |
r | regular | 2.22 |
우선 이 정보를 이용해 연료와 가격으로 구성된 데이터프레임을 만들어 보세요
# Fuel price lookup table: one row per fuel code (fl) with its price
# (price_fl), used to add fuel prices to mpg via a join on fl.
# stringsAsFactors is spelled out as FALSE rather than F: F is an ordinary
# variable that can be reassigned, whereas FALSE is a reserved word.
fuel <- data.frame(
  fl = c("c", "d", "e", "p", "r"),
  price_fl = c(2.35, 2.38, 2.11, 2.76, 2.22),
  stringsAsFactors = FALSE
)
fuel
## fl price_fl
## 1 c 2.35
## 2 d 2.38
## 3 e 2.11
## 4 p 2.76
## 5 r 2.22
Q1. mpg 데이터에는 연료 종류를 나타낸 fl 변수는 있지만 연료가격을 나타낸 변수는 없습니다. 위에서 만든 fuel 데이터를 이용해 mpg 데이터에 price_fl(연료가격) 변수를 추가하세요
# Add the price_fl column to mpg by looking up each row's fl code in fuel.
mpg_new <- mpg %>%
  left_join(fuel, by = "fl")
Q2. 연료가격 변수가 잘 추가됐는지 확인하기 위해 model, fl, price_fl 변수를 추출해 앞부분 5행을 출력해보세요
# Spot-check the join: keep only model, fl and price_fl and show the first
# five rows.
mpg_new %>%
  select(model, fl, price_fl) %>%
  head(n = 5)
## # A tibble: 5 x 3
## model fl price_fl
## <chr> <chr> <dbl>
## 1 a4 p 2.76
## 2 a4 p 2.76
## 3 a4 p 2.76
## 4 a4 p 2.76
## 5 a4 p 2.76
미국 동북중부 437개 지역의 인구통계 정보를 담고 있는 midwest 데이터를 이용해 분석문제를 해결해 보세요. midwest 는 ggplot2 패키지에 있습니다.
# Take the midwest dataset shipped with ggplot2 and convert it to a plain
# data.frame.
midwest <- ggplot2::midwest %>%
  as.data.frame()
# Show the column names and types.
str(midwest)
## 'data.frame': 437 obs. of 28 variables:
## $ PID : int 561 562 563 564 565 566 567 568 569 570 ...
## $ county : chr "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
## $ state : chr "IL" "IL" "IL" "IL" ...
## $ area : num 0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
## $ poptotal : int 66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
## $ popdensity : num 1271 759 681 1812 324 ...
## $ popwhite : int 63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
## $ popblack : int 1702 3496 429 127 547 50 1 111 16 16559 ...
## $ popamerindian : int 98 19 35 46 14 65 8 30 8 331 ...
## $ popasian : int 249 48 16 150 5 195 15 61 23 8033 ...
## $ popother : int 124 9 34 1139 6 221 0 84 6 1596 ...
## $ percwhite : num 96.7 66.4 96.6 95.3 90.2 ...
## $ percblack : num 2.575 32.9 2.862 0.412 9.373 ...
## $ percamerindan : num 0.148 0.179 0.233 0.149 0.24 ...
## $ percasian : num 0.3768 0.4517 0.1067 0.4869 0.0857 ...
## $ percother : num 0.1876 0.0847 0.2268 3.6973 0.1028 ...
## $ popadults : int 43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
## $ perchsd : num 75.1 59.7 69.3 75.5 68.9 ...
## $ percollege : num 19.6 11.2 17 17.3 14.5 ...
## $ percprof : num 4.36 2.87 4.49 4.2 3.37 ...
## $ poppovertyknown : int 63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
## $ percpovertyknown : num 96.3 99.1 95 98.5 82.5 ...
## $ percbelowpoverty : num 13.15 32.24 12.07 7.21 13.52 ...
## $ percchildbelowpovert: num 18 45.8 14 11.2 13 ...
## $ percadultpoverty : num 11.01 27.39 10.85 5.54 11.14 ...
## $ percelderlypoverty : num 12.44 25.23 12.7 6.22 19.2 ...
## $ inmetro : int 0 0 0 1 0 0 0 0 0 1 ...
## $ category : chr "AAR" "LHR" "AAR" "ALU" ...
# Preview the first six rows (head's default count, written out explicitly).
head(midwest, n = 6)
## PID county state area poptotal popdensity popwhite popblack
## 1 561 ADAMS IL 0.052 66090 1270.9615 63917 1702
## 2 562 ALEXANDER IL 0.014 10626 759.0000 7054 3496
## 3 563 BOND IL 0.022 14991 681.4091 14477 429
## 4 564 BOONE IL 0.017 30806 1812.1176 29344 127
## 5 565 BROWN IL 0.018 5836 324.2222 5264 547
## 6 566 BUREAU IL 0.050 35688 713.7600 35157 50
## popamerindian popasian popother percwhite percblack percamerindan
## 1 98 249 124 96.71206 2.5752761 0.1482826
## 2 19 48 9 66.38434 32.9004329 0.1788067
## 3 35 16 34 96.57128 2.8617170 0.2334734
## 4 46 150 1139 95.25417 0.4122574 0.1493216
## 5 14 5 6 90.19877 9.3728581 0.2398903
## 6 65 195 221 98.51210 0.1401031 0.1821340
## percasian percother popadults perchsd percollege percprof
## 1 0.37675897 0.18762294 43298 75.10740 19.63139 4.355859
## 2 0.45172219 0.08469791 6724 59.72635 11.24331 2.870315
## 3 0.10673071 0.22680275 9669 69.33499 17.03382 4.488572
## 4 0.48691813 3.69733169 19272 75.47219 17.27895 4.197800
## 5 0.08567512 0.10281014 3979 68.86152 14.47600 3.367680
## 6 0.54640215 0.61925577 23444 76.62941 18.90462 3.275891
## poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert
## 1 63628 96.27478 13.151443 18.01172
## 2 10529 99.08714 32.244278 45.82651
## 3 14235 94.95697 12.068844 14.03606
## 4 30337 98.47757 7.209019 11.17954
## 5 4815 82.50514 13.520249 13.02289
## 6 35107 98.37200 10.399635 14.15882
## percadultpoverty percelderlypoverty inmetro category
## 1 11.009776 12.443812 0 AAR
## 2 27.385647 25.228976 0 LHR
## 3 10.852090 12.697410 0 AAR
## 4 5.536013 6.217047 1 ALU
## 5 11.143211 19.200000 0 AAR
## 6 8.179287 11.008586 0 AAR
# Add ratio_child: the percentage of poptotal that is not counted in
# popadults.
midwest_new <- mutate(
  midwest,
  ratio_child = (poptotal - popadults) / poptotal * 100
)
str(midwest_new)
## 'data.frame': 437 obs. of 29 variables:
## $ PID : int 561 562 563 564 565 566 567 568 569 570 ...
## $ county : chr "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
## $ state : chr "IL" "IL" "IL" "IL" ...
## $ area : num 0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
## $ poptotal : int 66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
## $ popdensity : num 1271 759 681 1812 324 ...
## $ popwhite : int 63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
## $ popblack : int 1702 3496 429 127 547 50 1 111 16 16559 ...
## $ popamerindian : int 98 19 35 46 14 65 8 30 8 331 ...
## $ popasian : int 249 48 16 150 5 195 15 61 23 8033 ...
## $ popother : int 124 9 34 1139 6 221 0 84 6 1596 ...
## $ percwhite : num 96.7 66.4 96.6 95.3 90.2 ...
## $ percblack : num 2.575 32.9 2.862 0.412 9.373 ...
## $ percamerindan : num 0.148 0.179 0.233 0.149 0.24 ...
## $ percasian : num 0.3768 0.4517 0.1067 0.4869 0.0857 ...
## $ percother : num 0.1876 0.0847 0.2268 3.6973 0.1028 ...
## $ popadults : int 43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
## $ perchsd : num 75.1 59.7 69.3 75.5 68.9 ...
## $ percollege : num 19.6 11.2 17 17.3 14.5 ...
## $ percprof : num 4.36 2.87 4.49 4.2 3.37 ...
## $ poppovertyknown : int 63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
## $ percpovertyknown : num 96.3 99.1 95 98.5 82.5 ...
## $ percbelowpoverty : num 13.15 32.24 12.07 7.21 13.52 ...
## $ percchildbelowpovert: num 18 45.8 14 11.2 13 ...
## $ percadultpoverty : num 11.01 27.39 10.85 5.54 11.14 ...
## $ percelderlypoverty : num 12.44 25.23 12.7 6.22 19.2 ...
## $ inmetro : int 0 0 0 1 0 0 0 0 0 1 ...
## $ category : chr "AAR" "LHR" "AAR" "ALU" ...
## $ ratio_child : num 34.5 36.7 35.5 37.4 31.8 ...
# Five counties with the highest ratio_child: sort descending first, then
# keep only the two columns of interest.
midwest_new %>%
  arrange(desc(ratio_child)) %>%
  select(county, ratio_child) %>%
  head(n = 5)
## county ratio_child
## 1 ISABELLA 51.50117
## 2 MENOMINEE 50.59126
## 3 ATHENS 49.32073
## 4 MECOSTA 49.05918
## 5 MONROE 47.35818
분류 | 기준 |
---|---|
large | 40% 이상 |
middle | 30% 이상 ~ 40% 미만 |
small | 30% 미만 |
# Bucket each county by ratio_child and count the counties per bucket:
#   below 30       -> "small"
#   30 to below 40 -> "middle"
#   40 and above   -> "large"
midwest_new %>%
  mutate(
    level = ifelse(
      ratio_child < 30, "small",
      ifelse(ratio_child < 40, "middle", "large")
    )
  ) %>%
  group_by(level) %>%
  summarise(count = n())
## # A tibble: 3 x 2
## level count
## <chr> <int>
## 1 large 32
## 2 middle 396
## 3 small 9
# Ten counties with the smallest Asian population share: compute popasian as
# a percentage of poptotal, sort ascending, then keep the identifying columns.
midwest_new %>%
  mutate(ratio_asian = popasian / poptotal * 100) %>%
  arrange(ratio_asian) %>%
  select(state, county, ratio_asian) %>%
  head(n = 10)
## state county ratio_asian
## 1 WI MENOMINEE 0.00000000
## 2 IN BENTON 0.01059210
## 3 IN CARROLL 0.01594981
## 4 OH VINTON 0.02703190
## 5 WI IRON 0.03250447
## 6 IL SCOTT 0.05315379
## 7 IN CLAY 0.06071645
## 8 MI OSCODA 0.06375925
## 9 OH PERRY 0.06654625
## 10 IL PIATT 0.07074865