Exercise 1 Download the dataframe pirate_survey.txt from http://nathanieldphillips.com/wp-content/uploads/2015/05/pirate_survey.txt. The data are stored in a tab-separated text file with headers. Load the dataframe into an object called pirates.

# Read the survey directly from the course website. The file is
# tab-delimited with a header row; text columns are kept as character
# vectors rather than factors.
pirates <- read.table(
  file = "http://nathanieldphillips.com/wp-content/uploads/2015/05/pirate_survey.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Show the first six rows to check that the columns were parsed as expected.
head(pirates)
##   id    sex headband age college tattoos tchests.found parrots.lifetime
## 1  1 female      yes  37   JSSFP       9             6                2
## 2  2 female      yes  34   JSSFP      14             6                5
## 3  3   male       no  29    CCCC       9             1               13
## 4  4 female      yes  30   JSSFP       9             1                5
## 5  5   male      yes  16    CCCC      12             4                1
## 6  6   male      yes  27    CCCC       6             1                1
##   favorite.pirate sword.type sword.speed
## 1            Hook    cutlass   0.2572612
## 2      Lewis Scot    cutlass   0.2935024
## 3    Jack Sparrow   scimitar   0.8698797
## 4      Lewis Scot    cutlass   0.1832235
## 5            Hook    cutlass   0.3754281
## 6    Jack Sparrow    cutlass   0.1542810

Exercise 2 Let’s clean up the dataframe. Some of the values don’t seem to be appropriate. For example, when I look at the column sex, I see some bad values. For each of the columns, try to figure out which values are appropriate (hint: use table()), and recode all inappropriate values as NA.

# Inspect the raw values of sex.
table(pirates$sex)
## 
## female   male  other 
##    482    477     41
# Keep only "male" and "female"; every other value (here the 41 "other"
# entries) is recoded as NA.
pirates$sex[!(pirates$sex %in% c("male", "female"))] <- NA
# Without useNA the recoded rows are simply not shown ...
table(pirates$sex)
## 
## female   male 
##    482    477
# ... with useNA = "always" they appear in the <NA> column.
table(pirates$sex, useNA = "always")
## 
## female   male   <NA> 
##    482    477     41
# headband: the only accepted answers are "yes" and "no".
table(pirates[["headband"]])
## 
##  no yes 
##  97 903
# Replace anything outside the two accepted answers with NA
# (the table below shows that no row is affected).
pirates$headband <- replace(
  pirates$headband,
  !(pirates$headband %in% c("yes", "no")),
  NA
)
table(pirates[["headband"]], useNA = "always")
## 
##   no  yes <NA> 
##   97  903    0
# age: look at the range first (the minimum of 6 is outside the accepted range).
summary(pirates$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   24.00   28.00   27.64   31.00   46.00
# Ages outside 15-120 are recoded as NA.
pirates$age[!(pirates$age %in% 15:120)] <- NA
# summary() reports the NA count by itself; it has no useNA argument.
summary(pirates$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.00   24.00   28.00   27.77   31.00   46.00       8
table(pirates$age, useNA = "always")
## 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##    3   13   11   21   22   30   49   42   44   48   58   64   69   74   72 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##   76   56   34   46   42   27   30   16   15   11    9    3    1    1    3 
##   45   46 <NA> 
##    1    1    8
# college: two values, both look legitimate, so nothing is recoded.
table(pirates[["college"]])
## 
##  CCCC JSSFP 
##   653   347
# tattoos: inspect the counts and the range.
table(pirates[["tattoos"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   4   7   7  19  41  47  48  74 100 123 138 101 104  84  51  23  13  11 
##  18  20 
##   3   2
summary(pirates[["tattoos"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   8.000  10.000   9.569  12.000  20.000
# Accept whole numbers from 0 to 1000; anything else becomes NA
# (the output below shows that no value is affected).
pirates$tattoos <- replace(
  pirates$tattoos,
  !(pirates$tattoos %in% 0:1000),
  NA
)
summary(pirates[["tattoos"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   8.000  10.000   9.569  12.000  20.000
table(pirates[["tattoos"]], useNA = "always")
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##    4    7    7   19   41   47   48   74  100  123  138  101  104   84   51 
##   15   16   17   18   20 <NA> 
##   23   13   11    3    2    0
# tchests.found: inspected only, not recoded.
summary(pirates[["tchests.found"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   5.000   7.414  11.000  51.000
# parrots.lifetime: inspect the range before recoding.
summary(pirates[["parrots.lifetime"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.825   4.000  25.000
# Values outside 1-22 are recoded as NA.
# NOTE(review): this range also turns 0 parrots into NA (the minimum rises
# from 0 to 1 and 179 values become NA) - confirm that 0 is really meant
# to count as an invalid answer.
pirates$parrots.lifetime <- replace(
  pirates$parrots.lifetime,
  !(pirates$parrots.lifetime %in% 1:22),
  NA
)
summary(pirates[["parrots.lifetime"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    1.00    3.00    3.41    4.00   19.00     179
table(pirates[["parrots.lifetime"]], useNA = "always")
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   17 
##  230  176  130   84   54   42   34   21   17    9    7    3    7    2    3 
##   19 <NA> 
##    2  179
# favorite.pirate: six names occur in the data; all of them are accepted.
table(pirates[["favorite.pirate"]])
## 
##     Anicetus   Blackbeard   Edward Low         Hook Jack Sparrow 
##          146          107          105          102          426 
##   Lewis Scot 
##          114
# Any name outside the accepted list becomes NA (no row is affected here).
pirates$favorite.pirate <- replace(
  pirates$favorite.pirate,
  !(pirates$favorite.pirate %in% c("Anicetus", "Blackbeard", "Edward Low", "Hook", "Jack Sparrow", "Lewis Scot")),
  NA
)
table(pirates[["favorite.pirate"]], useNA = "always")
## 
##     Anicetus   Blackbeard   Edward Low         Hook Jack Sparrow 
##          146          107          105          102          426 
##   Lewis Scot         <NA> 
##          114            0
# Look at the first six rows again after the recoding so far.
head(pirates, n = 6L)
##   id    sex headband age college tattoos tchests.found parrots.lifetime
## 1  1 female      yes  37   JSSFP       9             6                2
## 2  2 female      yes  34   JSSFP      14             6                5
## 3  3   male       no  29    CCCC       9             1               13
## 4  4 female      yes  30   JSSFP       9             1                5
## 5  5   male      yes  16    CCCC      12             4                1
## 6  6   male      yes  27    CCCC       6             1                1
##   favorite.pirate sword.type sword.speed
## 1            Hook    cutlass   0.2572612
## 2      Lewis Scot    cutlass   0.2935024
## 3    Jack Sparrow   scimitar   0.8698797
## 4      Lewis Scot    cutlass   0.1832235
## 5            Hook    cutlass   0.3754281
## 6    Jack Sparrow    cutlass   0.1542810
# sword.type and sword.speed: inspected only, not recoded.
table(pirates[["sword.type"]])
## 
##   banana  cutlass    sabre scimitar 
##       42      827       65       66
summary(pirates[["sword.speed"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.001725  0.209800  0.504700  1.051000  1.123000 19.040000

Exercise 3 A fellow pirate captain wants to know if there is a relationship between the sex of my pirates and the number of treasure chests they have found. Using aggregate() figure out the mean number of treasure chests found by males, females, and other/NA.

# After cleaning, sex only holds "female" and "male"; the remaining
# pirates have NA.
table(pirates$sex)
## 
## female   male 
##    482    477
# The formula interface of aggregate() drops rows whose grouping value is NA,
# which would silently remove the "other/NA" group the exercise asks about.
# Label missing sex values "other/NA" in the data handed to aggregate() so
# that group is reported too; the pirates dataframe itself is not modified.
aggregated.data <- aggregate(
  tchests.found ~ sex,
  data = transform(pirates, sex = replace(sex, is.na(sex), "other/NA")),
  FUN = mean,
  na.rm = TRUE
)
aggregated.data
##        sex tchests.found
## 1   female      8.095436
## 2     male      6.761006
## 3 other/NA      7.000000

Exercise 4 Each pirate only uses one kind of sword, and their speed with their preferred sword is represented in sword.speed. Which sword types tend to have the fastest (i.e., smallest) sword speed? Test this by calculating the median sword.speed for each sword type.

head(pirates)
##   id    sex headband age college tattoos tchests.found parrots.lifetime
## 1  1 female      yes  37   JSSFP       9             6                2
## 2  2 female      yes  34   JSSFP      14             6                5
## 3  3   male       no  29    CCCC       9             1               13
## 4  4 female      yes  30   JSSFP       9             1                5
## 5  5   male      yes  16    CCCC      12             4                1
## 6  6   male      yes  27    CCCC       6             1                1
##   favorite.pirate sword.type sword.speed
## 1            Hook    cutlass   0.2572612
## 2      Lewis Scot    cutlass   0.2935024
## 3    Jack Sparrow   scimitar   0.8698797
## 4      Lewis Scot    cutlass   0.1832235
## 5            Hook    cutlass   0.3754281
## 6    Jack Sparrow    cutlass   0.1542810
# Median sword.speed for each sword type (smaller = faster).
# Note: despite its name, sword.mean holds medians, not means.
sword.mean <- aggregate(
  sword.speed ~ sword.type,
  data = pirates,
  FUN = median,
  na.rm = TRUE
)
# The cutlass has the smallest median and is therefore the fastest sword.
sword.mean
##   sword.type sword.speed
## 1     banana   1.9427477
## 2    cutlass   0.3979775
## 3      sabre   1.5646515
## 4   scimitar   2.7334913

Exercise 5 Is there a relationship between whether or not a pirate wears a headband and their speed with their sword? Test this in two ways.

First, calculate the median sword speed, separately for each level of headband use using aggregate(). What is your conclusion?

Second, calculate the median sword speed for all combinations of both headband AND sword.type. That is, calculate the median sword.speed for headband-wearers who use cutlasses, headband-wearers who use sabres, … headband-nonwearers who use cutlasses, headband-nonwearers who use sabres… (hint: include two independent variables in the formula argument to aggregate()). Does your conclusion change? If so, what do you think is going on?

# Part 1: median sword.speed by headband use alone (smaller = faster).
sword.headband <- aggregate(
  sword.speed ~ headband,
  data = pirates,
  FUN = median,
  na.rm = TRUE
)
sword.headband
##   headband sword.speed
## 1       no   1.1441930
## 2      yes   0.4584107
# Median sword.speed for every sex x sword.type combination.
# Rows with NA in sex are left out by the formula interface.
sword.sex.type <- aggregate(
  sword.speed ~ sex + sword.type,
  data = pirates,
  FUN = median,
  na.rm = TRUE
)
sword.sex.type
##      sex sword.type sword.speed
## 1 female     banana   2.0944283
## 2   male     banana   1.3423544
## 3 female    cutlass   0.4223560
## 4   male    cutlass   0.3811121
## 5 female      sabre   1.5193133
## 6   male      sabre   1.8346521
## 7 female   scimitar   2.8816917
## 8   male   scimitar   2.2276789
# Part 2: median sword.speed for every headband x sword.type combination.
sword.speed.headband <- aggregate(
  sword.speed ~ headband + sword.type,
  data = pirates,
  FUN = median,
  na.rm = TRUE
)
sword.speed.headband
##   headband sword.type sword.speed
## 1       no     banana   1.0902691
## 2      yes     banana   6.7000876
## 3       no    cutlass   0.3638043
## 4      yes    cutlass   0.4020915
## 5       no      sabre   1.2050550
## 6      yes      sabre   1.9749380
## 7       no   scimitar   2.0775947
## 8      yes   scimitar   4.0558626
# All three grouping variables at once: headband x sword.type x sex.
sword.all <- aggregate(
  sword.speed ~ headband + sword.type + sex,
  data = pirates,
  FUN = median,
  na.rm = TRUE
)
sword.all
##    headband sword.type    sex sword.speed
## 1        no     banana female   1.0902691
## 2       yes     banana female   6.4681032
## 3        no    cutlass female   0.3065877
## 4       yes    cutlass female   0.4339311
## 5        no      sabre female   1.2998394
## 6       yes      sabre female   1.9153348
## 7        no   scimitar female   2.5258784
## 8       yes   scimitar female   4.9011985
## 9        no     banana   male   1.0012506
## 10      yes     banana   male   6.7000876
## 11       no    cutlass   male   0.8152493
## 12      yes    cutlass   male   0.3784077
## 13       no      sabre   male   1.0841687
## 14      yes      sabre   male   2.3333113
## 15       no   scimitar   male   1.2479079
## 16      yes   scimitar   male   3.7374398

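Conclusion: Looking at headband use alone, pirates with a headband appear faster (median sword.speed 0.46 vs. 1.14; smaller is faster). Within every sword type, however, the pirates without a headband are faster than the pirates with a headband. The overall pattern reverses because headband wearers mostly use the cutlass, which is by far the fastest sword. Across all combinations, female pirates without a headband who use a cutlass are the fastest (median 0.31), while pirates with a headband and a banana sword are the slowest.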

Exercise 6: Does a pirate’s favorite pirate say anything about them? Do pirates whose favorite pirate is Hook have more tattoos on average or a faster sword speed than those whose favorite pirate is Blackbeard? Using dplyr, create the following aggregated dataframe which shows aggregated data depending on the pirates’ favorite pirate. Here are the four basic steps

First, define the dataframe THEN… (%>%)
Second, group the data by favorite.pirate, THEN… (%>%)
Third, use the summarise() function to tell R you will summarise the data
Fourth, define each of the new summary statistics (then close the summarize function)
library("dplyr")
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# One row per favorite pirate: mean number of tattoos and median
# sword.speed of the pirates who named that favorite.
pirates.fav <- pirates %>%
  group_by(favorite.pirate) %>%
  summarise(
    tattoos.mean = mean(tattoos, na.rm = TRUE),
    sword.speed.median = median(sword.speed, na.rm = TRUE)
  )

pirates.fav
## Source: local data frame [6 x 3]
## 
##   favorite.pirate tattoos.mean sword.speed.median
## 1        Anicetus     9.739726          0.5063631
## 2      Blackbeard     8.934579          0.5765946
## 3      Edward Low     9.285714          0.5187047
## 4            Hook     9.235294          0.5297751
## 5    Jack Sparrow     9.748826          0.5125712
## 6      Lewis Scot     9.833333          0.4175671