library(readr)
library(magrittr)
library(tidyr)
package ‘tidyr’ was built under R version 3.5.2
library(dplyr)
package ‘dplyr’ was built under R version 3.5.2
library(Hmisc)
package ‘Hmisc’ was built under R version 3.5.2
package ‘ggplot2’ was built under R version 3.5.2
library(outliers)
# Load the WHO data set from WHO.csv (path is relative to the working directory).
who <- read_csv("WHO.csv")
Parsed with column specification:
cols(
.default = col_double(),
country = col_character(),
iso2 = col_character(),
iso3 = col_character()
)
See spec(...) for full column specifications.
# Inspect the raw WHO table before reshaping.
who

# Step 1: stack the measurement columns (positions 5 to 60) into key/value
# pairs, with the former column name in `code` and its cell in `value`.
who_1 <- who %>%
  gather(key = "code", value = "value", 5:60)
who_1

# Step 2: split `code` on "_" into three parts, then split the combined
# sex/age part after its first character.
# tidyr::separate() is namespace-qualified because editrules, attached later
# in this file, masks separate(); an unqualified call would break when this
# chunk is re-run in the same session.
who_2 <- who_1 %>%
  tidyr::separate(code, into = c("new", "var", "sex_age"), sep = "_") %>%
  tidyr::separate(sex_age, into = c("sex", "age"), sep = 1)
who_2

# Step 3: spread the distinct values of `var` back out into their own columns.
who_3 <- who_2 %>%
  spread(key = var, value = value)
who_3

# Step 4: convert sex to a factor and age to an ordered factor with readable
# labels. TRUE is spelled out because T is an ordinary, rebindable variable.
who_4 <- who_3 %>%
  mutate(
    sex = factor(sex, levels = c("m", "f")),
    age = factor(
      age,
      levels = c("014", "1524", "2534", "3544", "4554", "5564", "65"),
      labels = c("<15", "15-24", "25-34", "35-44", "45-54", "55-64", "65>="),
      ordered = TRUE
    )
  )
who_4

# Step 5: drop the columns that are not wanted in the tidy table.
who_tidy <- who_4 %>%
  select(-c(iso2, new))

# Step 6: keep only the three countries of interest.
WHO_subset <- who_tidy %>%
  filter(country %in% c("Australia", "China", "Russian Federation"))
WHO_subset
# Load the species lookup table from species.csv (path is relative to the working directory).
species <- read_csv("species.csv")
Parsed with column specification:
cols(
species_id = col_character(),
genus = col_character(),
species = col_character(),
taxa = col_character()
)
# Load the survey records from surveys.csv (path is relative to the working directory).
surveys <- read_csv("surveys.csv")
Parsed with column specification:
cols(
record_id = col_double(),
month = col_double(),
day = col_double(),
year = col_double(),
species_id = col_character(),
sex = col_character(),
hindfoot_length = col_double(),
weight = col_double()
)
# Preview both input tables before joining them.
head(species)
head(surveys)

# Attach the species details to every survey record. A left join keeps all
# rows of `surveys`, including those without a match in `species`.
surveys_combined <- left_join(surveys, species, by = "species_id")
head(surveys_combined)

# Mean weight and hindfoot length per month for records whose species is
# "flavus"; missing measurements are excluded from the means.
flavus_avg <- surveys_combined %>%
  filter(species == "flavus") %>%
  group_by(month) %>%
  summarise(
    avg_weight = mean(weight, na.rm = TRUE),
    avg_hindfoot = mean(hindfoot_length, na.rm = TRUE)
  )
head(flavus_avg)

# Keep only the 1977 records. `year` is parsed as a double, so it is compared
# with a number rather than a string.
surveys_combined_year <- surveys_combined %>%
  filter(year == 1977)

# Count the missing weights within each species.
surveys_combined_year %>%
  group_by(species) %>%
  summarise(total_na = sum(is.na(weight)))

# Replace each missing weight with the mean weight of its species.
# The function is passed as FUN = mean: the second positional argument of
# zoo::na.aggregate() is `by`, not `FUN`.
# A species with no recorded weight at all has nothing to average, so its
# missing values become NaN rather than a number.
# ungroup() returns a plain (ungrouped) tibble so later steps are not silently
# evaluated per species.
surveys_weight_imputed <- surveys_combined_year %>%
  group_by(species) %>%
  mutate(weight = zoo::na.aggregate(weight, FUN = mean)) %>%
  ungroup()
Inspect the weight column in surveys_weight_imputed data frame for any further inconsistencies or special values (i.e., NaN, Inf, -Inf). Trace back and explain briefly why you got such a value.
# This is a chunk for Task 9. Provide your R code here:
library(editrules)
Loading required package: igraph
Attaching package: ‘igraph’
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following object is masked from ‘package:tidyr’:
crossing
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
Attaching package: ‘editrules’
The following objects are masked from ‘package:igraph’:
blocks, normalize
The following object is masked from ‘package:dplyr’:
contains
The following objects are masked from ‘package:tidyr’:
contains, separate
# Define one edit rule requiring weight to be strictly positive.
# The outer parentheses make R print the rule set as well as assigning it.
(Rule1 <- editset("weight > 0"))
Edit set:
num1 : 0 < weight
# Check every record of the imputed data against Rule1 and print the result.
# In the printed matrix, FALSE means the record does not violate the rule.
# NOTE(review): NA presumably marks records where the rule could not be
# evaluated because weight is missing/NaN -- confirm against the data.
violatedEdits(Rule1, surveys_weight_imputed)
edit
record num1
1 NA
2 NA
3 FALSE
4 FALSE
5 FALSE
6 FALSE
7 FALSE
8 FALSE
9 FALSE
10 FALSE
11 FALSE
12 FALSE
13 FALSE
14 FALSE
15 FALSE
16 FALSE
17 FALSE
18 FALSE
19 FALSE
20 FALSE
21 FALSE
22 NA
23 FALSE
24 NA
25 FALSE
26 FALSE
27 FALSE
28 FALSE
29 FALSE
30 FALSE
31 FALSE
32 FALSE
33 FALSE
34 FALSE
35 FALSE
36 FALSE
37 FALSE
38 NA
39 FALSE
40 FALSE
41 FALSE
42 FALSE
43 FALSE
44 FALSE
45 FALSE
46 FALSE
47 FALSE
48 FALSE
49 FALSE
50 FALSE
51 FALSE
52 FALSE
53 FALSE
54 FALSE
55 FALSE
56 FALSE
57 FALSE
58 FALSE
59 FALSE
60 FALSE
61 FALSE
62 FALSE
63 FALSE
64 FALSE
65 FALSE
66 FALSE
67 FALSE
68 FALSE
69 FALSE
70 FALSE
71 FALSE
72 NA
73 FALSE
74 FALSE
75 FALSE
76 FALSE
77 NA
78 FALSE
79 FALSE
80 FALSE
81 FALSE
82 FALSE
83 FALSE
84 FALSE
85 FALSE
86 FALSE
87 FALSE
88 FALSE
89 FALSE
90 FALSE
91 FALSE
92 FALSE
93 FALSE
94 FALSE
95 FALSE
96 FALSE
97 FALSE
98 FALSE
99 FALSE
100 FALSE
101 FALSE
102 FALSE
103 FALSE
104 FALSE
105 FALSE
106 NA
107 NA
108 FALSE
109 FALSE
110 FALSE
111 FALSE
112 FALSE
113 FALSE
114 FALSE
115 FALSE
116 FALSE
117 FALSE
118 FALSE
119 FALSE
120 FALSE
121 NA
122 FALSE
123 FALSE
124 FALSE
125 FALSE
126 FALSE
127 FALSE
128 FALSE
129 FALSE
130 FALSE
131 FALSE
132 FALSE
133 FALSE
134 FALSE
135 FALSE
136 FALSE
137 FALSE
138 FALSE
139 FALSE
140 FALSE
141 FALSE
142 FALSE
143 FALSE
144 FALSE
145 FALSE
146 FALSE
147 FALSE
148 FALSE
149 FALSE
150 FALSE
151 FALSE
152 FALSE
153 FALSE
154 FALSE
155 FALSE
156 FALSE
157 FALSE
158 FALSE
159 FALSE
160 FALSE
161 FALSE
162 FALSE
163 FALSE
164 FALSE
165 FALSE
166 FALSE
167 FALSE
168 FALSE
169 FALSE
170 FALSE
171 NA
172 FALSE
173 FALSE
174 FALSE
175 FALSE
176 FALSE
177 FALSE
178 FALSE
179 FALSE
180 FALSE
181 FALSE
182 FALSE
183 FALSE
184 FALSE
185 FALSE
186 FALSE
187 FALSE
188 FALSE
189 FALSE
190 FALSE
191 FALSE
192 FALSE
193 FALSE
194 NA
195 FALSE
196 FALSE
197 FALSE
198 FALSE
199 FALSE
200 FALSE
201 FALSE
202 FALSE
203 FALSE
204 FALSE
205 FALSE
206 FALSE
207 FALSE
208 FALSE
209 FALSE
210 FALSE
211 FALSE
212 FALSE
213 FALSE
214 NA
215 FALSE
216 FALSE
217 FALSE
218 FALSE
219 FALSE
220 FALSE
221 FALSE
222 FALSE
223 FALSE
224 NA
225 FALSE
226 FALSE
227 FALSE
228 FALSE
229 FALSE
230 FALSE
231 FALSE
232 FALSE
233 FALSE
234 FALSE
235 FALSE
236 FALSE
237 FALSE
238 FALSE
239 FALSE
240 FALSE
241 FALSE
242 FALSE
243 FALSE
244 FALSE
245 FALSE
246 FALSE
247 FALSE
248 FALSE
249 FALSE
250 FALSE
251 FALSE
252 FALSE
253 FALSE
254 FALSE
255 FALSE
256 FALSE
257 FALSE
258 FALSE
259 FALSE
260 FALSE
261 FALSE
262 FALSE
263 FALSE
264 FALSE
265 FALSE
266 NA
267 FALSE
268 FALSE
269 FALSE
270 FALSE
271 FALSE
272 NA
273 FALSE
274 FALSE
275 FALSE
276 FALSE
277 FALSE
278 FALSE
279 FALSE
280 FALSE
281 NA
282 NA
283 FALSE
284 FALSE
285 FALSE
286 FALSE
287 FALSE
288 FALSE
289 FALSE
290 FALSE
291 FALSE
292 FALSE
293 FALSE
294 FALSE
295 FALSE
296 FALSE
297 NA
298 FALSE
299 FALSE
300 FALSE
301 FALSE
302 FALSE
303 FALSE
304 FALSE
305 FALSE
306 FALSE
307 FALSE
308 FALSE
309 FALSE
310 FALSE
311 FALSE
312 FALSE
313 FALSE
314 NA
315 FALSE
316 FALSE
317 FALSE
318 FALSE
319 FALSE
320 FALSE
321 FALSE
322 FALSE
323 FALSE
324 NA
325 NA
326 NA
327 FALSE
328 FALSE
329 FALSE
330 FALSE
331 FALSE
332 FALSE
333 FALSE
334 FALSE
335 FALSE
336 FALSE
337 FALSE
338 FALSE
339 FALSE
340 FALSE
341 FALSE
342 FALSE
343 FALSE
344 NA
345 FALSE
346 FALSE
347 FALSE
348 FALSE
349 NA
350 FALSE
351 FALSE
352 FALSE
353 NA
354 FALSE
355 FALSE
356 FALSE
357 FALSE
358 FALSE
359 FALSE
360 FALSE
361 FALSE
362 FALSE
363 NA
364 FALSE
365 FALSE
366 FALSE
367 FALSE
368 FALSE
369 FALSE
370 FALSE
371 FALSE
372 FALSE
373 FALSE
374 FALSE
375 FALSE
376 FALSE
377 FALSE
378 FALSE
379 FALSE
380 FALSE
381 FALSE
382 FALSE
383 FALSE
384 FALSE
385 FALSE
386 FALSE
387 FALSE
388 FALSE
389 FALSE
390 NA
391 FALSE
392 FALSE
393 NA
394 FALSE
395 FALSE
396 FALSE
397 FALSE
398 FALSE
399 FALSE
400 FALSE
401 NA
402 NA
403 NA
404 FALSE
405 FALSE
406 NA
407 FALSE
408 FALSE
409 FALSE
410 FALSE
411 FALSE
412 FALSE
413 FALSE
414 FALSE
415 FALSE
416 NA
417 NA
418 NA
419 NA
420 FALSE
421 FALSE
422 FALSE
423 FALSE
424 FALSE
425 FALSE
426 FALSE
427 FALSE
428 FALSE
429 FALSE
430 FALSE
431 FALSE
432 FALSE
433 FALSE
434 FALSE
435 NA
436 FALSE
437 FALSE
438 FALSE
439 FALSE
440 FALSE
441 FALSE
442 FALSE
443 FALSE
444 FALSE
445 FALSE
446 FALSE
447 FALSE
448 FALSE
449 FALSE
450 FALSE
451 FALSE
452 FALSE
453 FALSE
454 FALSE
455 NA
456 NA
457 NA
458 NA
459 NA
460 FALSE
461 FALSE
462 FALSE
463 FALSE
464 FALSE
465 NA
466 FALSE
467 FALSE
468 FALSE
469 NA
470 FALSE
471 FALSE
472 FALSE
473 FALSE
474 FALSE
475 NA
476 FALSE
477 FALSE
478 FALSE
479 FALSE
480 FALSE
481 FALSE
482 FALSE
483 FALSE
484 FALSE
485 FALSE
486 NA
487 FALSE
488 FALSE
489 FALSE
490 FALSE
491 FALSE
492 FALSE
493 FALSE
494 FALSE
495 NA
496 FALSE
497 FALSE
498 FALSE
499 FALSE
500 FALSE
501 FALSE
502 FALSE
503 NA
Using the surveys_combined data frame, inspect the variable hindfoot length for possible univariate outliers. If you detect any outliers use any of the methods outlined in the Module 6 notes to deal with them. Explain briefly the actions that you take to handle outliers.
# Draw a box plot of hindfoot length to look for univariate outliers.
boxplot(surveys_combined$hindfoot_length, main = "Box Plot of hindfoot length")
# Cap univariate outliers in a numeric vector using the box-plot fences.
#
# Values below Q1 - 1.5 * IQR are replaced by the 5th percentile and values
# above Q3 + 1.5 * IQR are replaced by the 95th percentile. Missing values are
# ignored when the statistics are computed and are left in place.
#
# Args:
#   x: numeric vector, may contain NA.
# Returns:
#   A vector of the same length as x with the outliers replaced.
cap <- function(x) {
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  quantiles <- quantile(x, c(0.05, 0.25, 0.75, 0.95), na.rm = TRUE)

  # Compute both fences once, from the unmodified data, so the upper fence
  # does not depend on the replacements made at the lower end.
  whisker <- 1.5 * IQR(x, na.rm = TRUE)
  lower_fence <- quantiles[2] - whisker
  upper_fence <- quantiles[3] + whisker

  # NA comparisons yield NA in the index; with a length-one replacement value
  # those positions are skipped, so missing values stay missing.
  x[x < lower_fence] <- quantiles[1]
  x[x > upper_fence] <- quantiles[4]
  x
}
# Print the capped hindfoot lengths. The result is not assigned, so
# surveys_combined itself is left unchanged.
cap(surveys_combined$hindfoot_length)
[1] 32 33 37 36 35 14 NA 37 34 20 53 38 35 NA 36 36 48 22 NA 48 34 31 36 21 35 31 36 38 NA 52 37 35 36 NA 38 22
[37] 35 33 36 36 34 46 36 35 36 35 32 36 17 32 36 26 36 37 36 34 NA 45 33 20 35 35 35 37 34 35 35 32 15 21 36 31
[73] 44 12 32 47 NA 16 34 48 14 35 37 35 35 33 11 35 20 35 50 35 NA 36 38 36 36 38 37 54 35 35 35 43 35 NA NA 21
[109] 35 NA 37 37 13 32 34 37 19 33 35 35 NA 33 37 36 35 30 33 34 34 34 37 33 37 45 35 16 35 37 15 33 36 34 35 40
[145] 37 36 37 37 50 35 35 38 50 47 NA 35 36 36 53 37 37 39 21 36 50 13 36 36 37 36 NA NA 36 37 33 45 NA 33 37 34
[181] 35 49 37 52 19 34 36 48 33 36 47 36 NA NA 50 15 37 35 36 52 47 32 38 37 50 52 53 37 35 36 48 NA 35 NA 36 13
[217] 19 13 36 52 NA 52 50 NA NA 37 NA 38 35 NA NA 48 25 35 37 47 50 20 48 53 NA 19 38 36 37 37 54 NA 36 51 36 53
[253] 38 36 35 36 37 36 36 36 36 19 37 49 38 NA 46 36 37 36 50 NA 37 20 37 50 20 50 19 37 NA NA 34 37 36 50 38 37
[289] 38 33 47 36 36 37 36 18 NA 36 19 NA 48 36 36 37 35 33 36 48 49 NA 18 36 35 NA 36 37 32 33 36 35 NA 47 33 NA
[325] NA NA 50 NA 47 37 50 16 NA 37 52 36 35 37 49 36 50 34 NA NA 34 37 38 36 NA 37 NA 36 NA 38 NA 32 50 16 36 35
[361] 36 51 NA 37 38 NA 51 16 36 35 36 36 34 37 15 15 48 33 36 33 48 37 52 36 50 NA NA 36 NA NA 15 53 NA 48 36 36
[397] 16 50 36 NA NA NA NA 37 NA NA 50 9 36 53 36 52 49 34 36 NA NA NA NA 38 51 37 36 37 36 36 37 33 NA 15 NA NA
[433] 38 16 NA 37 37 36 36 16 NA NA 35 38 NA NA 37 51 35 33 51 NA 36 36 NA NA NA NA NA NA 36 36 49 49 NA 51 32 35
[469] NA 15 NA 49 NA 15 NA NA 47 NA 36 47 35 37 49 37 52 NA 38 37 NA 37 48 49 49 NA NA 36 NA 16 51 NA NA 36 NA NA
[505] 36 NA 35 36 NA 16 36 48 NA 16 NA 38 26 36 38 15 36 37 36 NA 34 51 37 51 36 37 NA NA NA NA NA NA NA 36 34 47
[541] 36 36 49 NA 38 NA NA 53 38 36 50 51 50 NA 48 36 37 49 36 50 53 NA NA 51 NA 36 36 NA NA NA NA 38 NA 34 38 36
[577] NA NA NA NA NA NA NA NA 53 35 NA NA 39 16 38 39 38 38 37 NA 33 NA 15 37 36 37 NA 22 NA NA 36 34 49 37 20 NA
[613] 49 49 49 36 38 47 39 49 NA 46 NA 35 35 20 NA NA NA 18 20 NA 55 NA 38 36 38 37 35 50 38 36 35 38 38 32 52 37
[649] 35 15 38 14 47 37 32 49 54 21 20 37 NA 15 18 35 35 21 35 21 50 20 20 15 NA 49 NA NA 52 49 49 NA NA NA NA 35
[685] NA NA NA 48 53 36 50 NA NA 37 NA 37 36 NA 36 37 NA 35 NA 35 NA NA NA 20 NA 49 37 36 15 NA 36 35 NA 35 37 47
[721] 37 NA 16 37 NA NA NA NA 36 NA 47 19 33 36 18 NA 36 NA 19 NA 48 18 36 21 37 20 35 NA NA NA 35 49 36 37 32 20
[757] NA 34 49 48 NA NA 50 35 36 35 NA NA NA 37 35 NA 37 36 37 37 20 36 NA NA 37 37 47 NA 39 52 50 36 52 50 53 34
[793] 49 35 33 50 50 48 36 35 46 NA 48 34 47 47 35 51 47 NA NA NA NA 55 38 46 37 49 NA 35 34 48 33 38 NA 20 36 NA
[829] 48 38 NA NA 14 38 53 49 48 37 51 37 NA 38 50 46 32 NA 35 32 36 48 46 36 49 36 NA 37 NA NA 13 NA 19 52 48 NA
[865] 37 37 37 50 33 49 NA 52 50 37 33 36 50 NA NA NA 35 52 20 49 36 NA 34 34 50 49 35 50 48 35 50 50 21 35 33 37
[901] 48 NA 35 52 33 34 36 33 36 37 48 NA 37 NA 18 50 52 21 35 15 48 50 34 37 36 35 48 NA 48 36 46 37 20 47 NA NA
[937] NA 38 51 51 36 32 NA 50 49 36 33 37 49 34 21 34 52 36 47 49 33 36 47 NA 36 47 50 36 NA 35 49 13 15 NA 32 NA
[973] 47 49 35 15 34 18 NA 51 21 51 NA NA NA NA 36 NA 50 NA 23 38 54 33 16 48 49 51 34 46
[ reached getOption("max.print") -- omitted 34549 entries ]