This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Chapter 9, Advanced dataframe manipulation
#Task 1: Downloading the dataframe
pirates.errors <- read.table("http://nathanieldphillips.com/wp-content/uploads/2015/05/pirate_survey_witherrors.txt", header = T , sep= "\t", stringsAsFactors=F)
#Task 2: Cleaning up the dataframe
pirates.errors$sex [!(pirates.errors$sex %in% c("male","female","other/NA"))] <- NA
table(pirates.errors$sex)
##
## female male
## 466 490
pirates.errors$headband [!(pirates.errors$headband %in% c ("no","yes"))] <- NA
table(pirates.errors$headband) # what is a headband and sometimes should become NA
##
## no yes
## 97 893
pirates.errors$age [!(pirates.errors$age %in% seq (1, 100,1))] <- NA
table(pirates.errors$age) # all values lower than 0 and higher than 100 should be marked as NA
##
## 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## 1 1 5 1 2 4 5 7 5 19 20 33 35 38 55 73 52 63 55 58 69 70 57 56 40
## 34 35 36 37 38 39 40 41 42 43 45 46 48
## 45 32 32 12 10 5 5 6 4 2 1 1 1
pirates.errors$tattoos [!(pirates.errors$tattoos %in% seq (1, 130, 1))] <- NA
table(pirates.errors$tattoos)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 5 13 24 24 39 60 73 124 119 110 128 91 73 46 21 15 9 6
## 19
## 2
pirates.errors$favorite.pirate [!(pirates.errors$favorite.pirate %in% c ("Anicetus", "Blackbeard", "Edward Low", "Hook", "Jack Sparrow", "Lewis Scot") )] <- NA
table(pirates.errors$favorite.pirate)
##
## Anicetus Blackbeard Edward Low Hook Jack Sparrow
## 117 100 113 114 450
## Lewis Scot
## 96
pirates.errors$sword.type [!(pirates.errors$sword.type %in% c ("cutlass", "sabre", "scimitar"))] <- NA # I was not sure whether banana should be marked as NA or not, I decided to leave it out
table(pirates.errors$sword.type)
##
## cutlass sabre scimitar
## 842 62 57
#Task 3
pirates <- read.table("http://nathanieldphillips.com/wp-content/uploads/2015/05/pirate_survey_noerrors.txt", sep = "\t", header = T, stringsAsFactors = F)
# Mean number of treasure tchests foudn by males, females and other
# I often used the function view to look at the data, but I had to leave it out, because otherwise the document would not knit
aggregate(formula= tchests.found ~ sex, FUN= mean, na.rm= T, data= pirates)
## sex tchests.found
## 1 female 7.353319
## 2 male 7.128049
## 3 other 8.048780
# Task 4 Calculating the median sword.speed for each sword.type
aggregate(formula= sword.speed ~ sword.type, FUN= median, na.rm= T, data= pirates)
## sword.type sword.speed
## 1 banana 2.5859139
## 2 cutlass 0.4848266
## 3 sabre 1.7393120
## 4 scimitar 1.7559671
# highest banana, lowest cutlass
# Task 5 first way
aggregated.headband <-aggregate(formula= sword.speed ~ headband, FUN= median, na.rm= T, data= pirates)
# Pirates not wearing a headband had a higher sword.speed (no = 1.0780988)
# Task 5 second way
aggregated.swordspeed <- aggregate(formula=sword.speed ~ headband+sword.type, FUN= median,na.rm= T, data= pirates)
# There are some differences between the conclusions you draw from the first way compared to the second way. This time you would derive from the table that those pirates wearing a headband and using the sword type banana have the highest number for the column word speed (7.9722183) and those pirates not wearing a headband and using the sword type cutlass have the lowest number for word speed (0.3408127). Compared to the first way this is different as the fact of wearing a headband was just the other way around: those pirates wearing a headband had a higher sword speed than those not wearing one (no = 1.078098, yes= 0.5375353).
# Task 6
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
favorite.pirates.agg <- pirates %>%
group_by(favorite.pirate) %>%
summarise(frequency = n() ,tattoos.mean=mean(tattoos, na.rm= T), sword.speed.med = median(sword.speed, na.rm= T)
)
favorite.pirates.agg
## Source: local data frame [6 x 4]
##
## favorite.pirate frequency tattoos.mean sword.speed.med
## 1 Anicetus 120 9.100000 0.4776414
## 2 Blackbeard 100 9.620000 0.7313608
## 3 Edward Low 114 9.342105 0.5371732
## 4 Hook 115 9.713043 0.6046085
## 5 Jack Sparrow 453 9.607064 0.5391035
## 6 Lewis Scot 98 9.081633 0.5539311
# Task 7
aggregate.college <- aggregate(formula= college ~ age, FUN= median, data= pirates, na.rm= T)
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
aggregate.college
## age college
## 1 9 CCCC
## 2 10 CCCC
## 3 11 CCCC
## 4 12 CCCC
## 5 13 <NA>
## 6 14 <NA>
## 7 15 CCCC
## 8 16 <NA>
## 9 17 CCCC
## 10 18 CCCC
## 11 19 <NA>
## 12 20 CCCC
## 13 21 CCCC
## 14 22 <NA>
## 15 23 CCCC
## 16 24 CCCC
## 17 25 CCCC
## 18 26 <NA>
## 19 27 <NA>
## 20 28 <NA>
## 21 29 CCCC
## 22 30 CCCC
## 23 31 <NA>
## 24 32 <NA>
## 25 33 JSSFP
## 26 34 JSSFP
## 27 35 JSSFP
## 28 36 <NA>
## 29 37 <NA>
## 30 38 <NA>
## 31 39 JSSFP
## 32 40 JSSFP
## 33 41 <NA>
## 34 42 <NA>
## 35 43 <NA>
## 36 45 JSSFP
## 37 46 JSSFP
## 38 48 JSSFP
# Task 8
plot(x=pirates$tattoos, y=pirates$tchests.found, main="scatterplot", xlab= "tattoos", ylab= "tchests found", pch= 16, cex=1, col="lightgray", type= "p")
pirates$tattoos.cut5 <- cut(pirates$tattoos, seq(0,20,5))
pirates$tattoos.cut5
## [1] (15,20] (5,10] (10,15] (5,10] (10,15] (5,10] (10,15] (5,10]
## [9] (10,15] (10,15] (5,10] (10,15] (10,15] (5,10] (5,10] (5,10]
## [17] (10,15] (5,10] (0,5] (10,15] (5,10] (10,15] (15,20] (0,5]
## [25] (5,10] (5,10] (5,10] (10,15] (5,10] (10,15] (5,10] (5,10]
## [33] (5,10] (5,10] (10,15] (10,15] (5,10] (10,15] <NA> (10,15]
## [41] (5,10] (10,15] (5,10] (5,10] (0,5] (15,20] (10,15] (5,10]
## [49] (10,15] (0,5] (5,10] (15,20] (10,15] (5,10] (0,5] (0,5]
## [57] (5,10] (10,15] (5,10] (10,15] (5,10] (10,15] (10,15] (0,5]
## [65] (10,15] (5,10] (5,10] (5,10] (10,15] (0,5] (0,5] (10,15]
## [73] (5,10] (0,5] (5,10] (10,15] (10,15] (5,10] (5,10] <NA>
## [81] (5,10] (0,5] (5,10] (5,10] (0,5] (0,5] <NA> (10,15]
## [89] (5,10] (5,10] (15,20] (5,10] (5,10] (5,10] (0,5] (15,20]
## [97] (10,15] (0,5] (10,15] (5,10] (10,15] (5,10] (10,15] (5,10]
## [105] (5,10] (15,20] (5,10] (10,15] (10,15] (5,10] (10,15] (5,10]
## [113] (5,10] (10,15] (10,15] (10,15] (5,10] (5,10] (10,15] (5,10]
## [121] (10,15] (10,15] (10,15] (10,15] (5,10] (10,15] (5,10] (10,15]
## [129] (10,15] (5,10] (10,15] (10,15] (5,10] (5,10] (5,10] (0,5]
## [137] (5,10] (10,15] (5,10] (5,10] (10,15] (5,10] (0,5] (10,15]
## [145] (0,5] (5,10] (10,15] (5,10] (5,10] (10,15] <NA> (5,10]
## [153] (0,5] (10,15] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10]
## [161] (5,10] (10,15] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10]
## [169] (0,5] (0,5] (0,5] (10,15] (5,10] (10,15] (5,10] (10,15]
## [177] (15,20] (10,15] (5,10] (0,5] (10,15] (10,15] (5,10] (5,10]
## [185] (5,10] (5,10] (10,15] (5,10] (10,15] (10,15] (10,15] (10,15]
## [193] (5,10] (10,15] (15,20] (5,10] (10,15] (5,10] (0,5] (10,15]
## [201] (5,10] (0,5] (15,20] (5,10] (0,5] (10,15] (5,10] (10,15]
## [209] (0,5] (10,15] (5,10] (10,15] (5,10] (5,10] (5,10] (5,10]
## [217] (0,5] (5,10] (5,10] (5,10] (10,15] (10,15] (10,15] (10,15]
## [225] (0,5] (10,15] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10]
## [233] (10,15] (5,10] (5,10] (10,15] (10,15] (5,10] (5,10] (0,5]
## [241] (10,15] (10,15] (0,5] (10,15] (0,5] (10,15] (5,10] (10,15]
## [249] (5,10] (5,10] (10,15] (5,10] (0,5] (10,15] (0,5] (5,10]
## [257] (5,10] (0,5] (5,10] (5,10] (10,15] (5,10] (10,15] (10,15]
## [265] (0,5] (5,10] (15,20] (10,15] (0,5] (0,5] (5,10] (10,15]
## [273] (0,5] (5,10] (5,10] (10,15] (10,15] (10,15] (5,10] (10,15]
## [281] (10,15] (10,15] (10,15] (0,5] (0,5] (10,15] (5,10] (5,10]
## [289] (5,10] (10,15] (0,5] (10,15] (10,15] (10,15] (0,5] (0,5]
## [297] (5,10] (5,10] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10]
## [305] (10,15] (5,10] (10,15] (10,15] (5,10] (0,5] (10,15] (10,15]
## [313] (5,10] (10,15] (5,10] (10,15] (10,15] (5,10] (0,5] (5,10]
## [321] (10,15] (10,15] (5,10] (10,15] (10,15] (10,15] (10,15] (5,10]
## [329] (5,10] (5,10] (5,10] (10,15] (0,5] (10,15] (10,15] (10,15]
## [337] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10]
## [345] (10,15] (10,15] (10,15] (5,10] (5,10] (5,10] (15,20] (5,10]
## [353] (10,15] (10,15] (10,15] (10,15] (5,10] (10,15] (0,5] (10,15]
## [361] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10] (0,5] (5,10]
## [369] (5,10] (5,10] (5,10] (5,10] (10,15] (10,15] (5,10] (5,10]
## [377] (10,15] (10,15] (10,15] (10,15] (5,10] (5,10] (5,10] (10,15]
## [385] (10,15] (0,5] (5,10] (10,15] (0,5] (5,10] (10,15] (5,10]
## [393] (10,15] (5,10] (5,10] (10,15] (10,15] (10,15] (10,15] (5,10]
## [401] (10,15] (10,15] (5,10] (10,15] (10,15] (10,15] (5,10] (5,10]
## [409] (0,5] (10,15] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10]
## [417] (5,10] (10,15] (5,10] (5,10] (5,10] (0,5] (5,10] (10,15]
## [425] (5,10] (10,15] (10,15] (5,10] (10,15] (5,10] (5,10] (0,5]
## [433] (5,10] (0,5] (10,15] (5,10] (5,10] (10,15] (5,10] (10,15]
## [441] (5,10] (10,15] (5,10] (15,20] (5,10] (5,10] (5,10] (10,15]
## [449] (5,10] (10,15] (10,15] (5,10] (10,15] (15,20] (5,10] (10,15]
## [457] (10,15] (5,10] (10,15] (5,10] (5,10] (5,10] (5,10] (5,10]
## [465] (10,15] (5,10] (5,10] (0,5] (5,10] (10,15] (5,10] (15,20]
## [473] (0,5] (5,10] (10,15] (10,15] (5,10] (0,5] (5,10] (5,10]
## [481] (10,15] (10,15] (10,15] (5,10] (0,5] (10,15] (10,15] (5,10]
## [489] (10,15] (5,10] (5,10] (10,15] (5,10] (5,10] (0,5] (5,10]
## [497] (5,10] (15,20] (5,10] (5,10] (10,15] (5,10] (10,15] (0,5]
## [505] (5,10] (5,10] (10,15] (10,15] (10,15] (5,10] (5,10] (0,5]
## [513] (5,10] (10,15] (0,5] (0,5] (10,15] (5,10] (5,10] (5,10]
## [521] (10,15] (5,10] (5,10] (5,10] (5,10] (10,15] (5,10] (15,20]
## [529] (10,15] (10,15] (10,15] (5,10] (10,15] (5,10] (10,15] (10,15]
## [537] (5,10] (10,15] (10,15] <NA> (5,10] (15,20] (10,15] (10,15]
## [545] (10,15] (10,15] (10,15] (10,15] (10,15] (10,15] (5,10] (5,10]
## [553] (5,10] (5,10] (5,10] (10,15] (10,15] (5,10] (5,10] (10,15]
## [561] (10,15] (10,15] (10,15] (5,10] (10,15] (0,5] (10,15] (5,10]
## [569] (5,10] (10,15] (0,5] (5,10] (10,15] (10,15] (5,10] (5,10]
## [577] (10,15] (10,15] (5,10] (10,15] (5,10] (5,10] (5,10] (0,5]
## [585] (5,10] (10,15] (5,10] (5,10] (5,10] (5,10] (10,15] (5,10]
## [593] (5,10] (5,10] (10,15] (10,15] (10,15] (5,10] (10,15] (5,10]
## [601] (5,10] (5,10] (5,10] (0,5] (5,10] (10,15] (10,15] (10,15]
## [609] (10,15] <NA> (10,15] (5,10] (15,20] (5,10] (0,5] (5,10]
## [617] (0,5] (5,10] (0,5] (5,10] (5,10] (5,10] (10,15] (5,10]
## [625] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10] (0,5] (10,15]
## [633] (5,10] (10,15] (5,10] (10,15] (15,20] (10,15] (5,10] (10,15]
## [641] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10] (10,15] (0,5]
## [649] (10,15] (10,15] (5,10] (0,5] (10,15] (10,15] (5,10] (5,10]
## [657] (10,15] (10,15] (5,10] (10,15] (5,10] (10,15] (0,5] (10,15]
## [665] (5,10] (5,10] (0,5] (5,10] (5,10] (5,10] (15,20] (5,10]
## [673] (5,10] (15,20] (5,10] (10,15] (5,10] (5,10] (10,15] (10,15]
## [681] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10] (5,10] (15,20]
## [689] (10,15] (5,10] (5,10] (15,20] (10,15] (0,5] (5,10] (5,10]
## [697] (5,10] (5,10] (10,15] (5,10] (5,10] (10,15] (5,10] (5,10]
## [705] (10,15] (5,10] (5,10] (5,10] (10,15] (5,10] (10,15] (5,10]
## [713] (10,15] (5,10] (5,10] (10,15] (5,10] (10,15] (5,10] (5,10]
## [721] (10,15] (5,10] (5,10] (5,10] (5,10] (5,10] (0,5] (5,10]
## [729] (5,10] (10,15] (5,10] (10,15] (5,10] (10,15] (5,10] (5,10]
## [737] (5,10] (10,15] (5,10] (10,15] (5,10] (0,5] (10,15] (5,10]
## [745] (0,5] (5,10] (5,10] (5,10] (5,10] (0,5] (5,10] (5,10]
## [753] (10,15] (5,10] (10,15] (10,15] (5,10] (10,15] (5,10] (10,15]
## [761] (10,15] (5,10] (5,10] (5,10] (10,15] (10,15] (5,10] (10,15]
## [769] (5,10] (10,15] (0,5] (5,10] (0,5] (0,5] (5,10] (5,10]
## [777] (5,10] (5,10] (5,10] (5,10] (10,15] (5,10] (5,10] (5,10]
## [785] (5,10] (10,15] (10,15] (10,15] (5,10] (5,10] (10,15] (15,20]
## [793] (10,15] (5,10] (5,10] (10,15] (5,10] (5,10] (0,5] (0,5]
## [801] (5,10] (10,15] (0,5] (5,10] (5,10] (5,10] (10,15] (5,10]
## [809] (0,5] (5,10] (10,15] (10,15] (10,15] (10,15] (10,15] (10,15]
## [817] (10,15] (5,10] (5,10] (10,15] (15,20] (10,15] (10,15] (5,10]
## [825] (5,10] (5,10] (10,15] (5,10] (10,15] (0,5] (5,10] (10,15]
## [833] (10,15] (10,15] (5,10] (5,10] (5,10] (10,15] (15,20] (5,10]
## [841] (10,15] (5,10] (15,20] (0,5] (5,10] (10,15] (5,10] (5,10]
## [849] (10,15] (5,10] (0,5] (10,15] (10,15] (5,10] (5,10] (5,10]
## [857] (10,15] (10,15] (5,10] (10,15] (10,15] (5,10] (5,10] (5,10]
## [865] (5,10] (5,10] (5,10] (0,5] (10,15] (5,10] (0,5] (5,10]
## [873] (5,10] (10,15] (10,15] (5,10] (10,15] (5,10] (10,15] (10,15]
## [881] (5,10] (10,15] (15,20] (10,15] (10,15] (5,10] (10,15] (10,15]
## [889] (10,15] (0,5] (10,15] (5,10] (5,10] (10,15] (5,10] (5,10]
## [897] (10,15] (5,10] (5,10] (10,15] (10,15] (5,10] (5,10] (5,10]
## [905] (5,10] (5,10] (0,5] (5,10] (10,15] (5,10] (5,10] (10,15]
## [913] (15,20] (10,15] (10,15] (5,10] (10,15] (5,10] (5,10] (10,15]
## [921] (10,15] (0,5] (5,10] (5,10] (5,10] (5,10] (15,20] (5,10]
## [929] (5,10] (10,15] (5,10] (5,10] (5,10] (5,10] (0,5] (5,10]
## [937] (5,10] (5,10] (10,15] (0,5] (5,10] (5,10] (15,20] (5,10]
## [945] (5,10] (5,10] (10,15] (10,15] (5,10] (5,10] (5,10] (5,10]
## [953] (10,15] (10,15] (0,5] (5,10] (5,10] (10,15] (5,10] (0,5]
## [961] (10,15] (0,5] (10,15] (5,10] (5,10] (5,10] (5,10] (10,15]
## [969] (5,10] (5,10] (0,5] (10,15] (0,5] (5,10] (5,10] (5,10]
## [977] (5,10] (0,5] (5,10] (10,15] (0,5] (5,10] (10,15] (10,15]
## [985] <NA> (5,10] (5,10] (10,15] (10,15] (10,15] (5,10] (5,10]
## [993] (10,15] (5,10] (10,15] (5,10] (10,15] (10,15] (5,10] <NA>
## Levels: (0,5] (5,10] (10,15] (15,20]
aggregate.tchests <- aggregate(formula=tchests.found ~ tattoos.cut5, FUN= median, data= pirates, na.rm= T)
aggregate.tchests
## tattoos.cut5 tchests.found
## 1 (0,5] 4.0
## 2 (5,10] 5.0
## 3 (10,15] 5.0
## 4 (15,20] 7.5
plot(x=pirates$tattoos.cut5, y=pirates$tchests.found, main="scatterplot", xlab= "Tattoos", ylab= "median tchests found", pch= 16, cex=1, col="lightgray", type= "p", xlim=c(0,5), ylim=c(0,9))