setwd("/Users/HollyJones/Desktop/EDA")
library(dplyr)
titanic = read.csv("TitanicSurvival.csv")
dim(titanic)
## [1] 1309 5
str(titanic)
## 'data.frame': 1309 obs. of 5 variables:
## $ X : Factor w/ 1309 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ survived : Factor w/ 2 levels "no","yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.917 2 30 25 ...
## $ passengerClass: Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
Let's change the “X” variable to “names” (since that is what it represents) using Dplyr's “rename” function
Function syntax: dataset <- rename(dataset, NEW variable name = OLD variable name)
titanic <- rename(titanic, names=X)
head(titanic)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
The select() function allows you to get the few columns you might need
Selecting the first 3 columns:
names(titanic)[1:3]
## [1] "names" "survived" "sex"
subset <- select(titanic,names:sex)
head(subset)
## names survived sex
## 1 Allen, Miss. Elisabeth Walton yes female
## 2 Allison, Master. Hudson Trevor yes male
## 3 Allison, Miss. Helen Loraine no female
## 4 Allison, Mr. Hudson Joshua Crei no male
## 5 Allison, Mrs. Hudson J C (Bessi no female
## 6 Anderson, Mr. Harry yes male
Resetting the data frame
subset <- select(titanic,names:passengerClass)
head(subset)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
Omitting a given set of columns:
subset <- select(titanic, -(age:passengerClass))
head(subset)
## names survived sex
## 1 Allen, Miss. Elisabeth Walton yes female
## 2 Allison, Master. Hudson Trevor yes male
## 3 Allison, Miss. Helen Loraine no female
## 4 Allison, Mr. Hudson Joshua Crei no male
## 5 Allison, Mrs. Hudson J C (Bessi no female
## 6 Anderson, Mr. Harry yes male
Resetting the data frame
subset <- select(titanic,names:passengerClass)
head(subset)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
The select() function also allows a special syntax that allows you to specify variable names based on patterns. So, for example, if you wanted to keep every variable that ends with a āsā, we could do:
subset <- select(titanic, ends_with("s"))
head(subset)
## names passengerClass
## 1 Allen, Miss. Elisabeth Walton 1st
## 2 Allison, Master. Hudson Trevor 1st
## 3 Allison, Miss. Helen Loraine 1st
## 4 Allison, Mr. Hudson Joshua Crei 1st
## 5 Allison, Mrs. Hudson J C (Bessi 1st
## 6 Anderson, Mr. Harry 1st
Resetting the data frame
subset <- select(titanic,names:passengerClass)
head(subset)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
We could also use the “starts_with” syntax:
subset <- select(titanic, starts_with("s"))
head(subset)
## survived sex
## 1 yes female
## 2 yes male
## 3 no female
## 4 no male
## 5 no female
## 6 yes male
Resetting the data frame
subset <- select(titanic,names:passengerClass)
head(subset)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
This is not part of Dplyr, but before we move on to the Filter function we should drop rows that contain NA values
This will give us the list of rows where “NA” was provided as a value and the corresponding count
subset[!complete.cases(subset),]
## names survived sex age passengerClass
## 16 Baumann, Mr. John D no male NA 1st
## 38 Bradley, Mr. George (George Ar yes male NA 1st
## 41 Brewe, Dr. Arthur Jackson no male NA 1st
## 47 Cairns, Mr. Alexander no male NA 1st
## 60 Cassebeer, Mrs. Henry Arthur Jr yes female NA 1st
## 70 Chibnall, Mrs. (Edith Martha Bo yes female NA 1st
## 71 Chisholm, Mr. Roderick Robert C no male NA 1st
## 75 Clifford, Mr. George Quincy no male NA 1st
## 81 Crafton, Mr. John Bertram no male NA 1st
## 107 Farthing, Mr. John no male NA 1st
## 108 Flegenheim, Mrs. Alfred (Antoin yes female NA 1st
## 109 Fleming, Miss. Margaret yes female NA 1st
## 119 Franklin, Mr. Thomas Parham no male NA 1st
## 122 Frauenthal, Mrs. Henry William yes female NA 1st
## 126 Fry, Mr. Richard no male NA 1st
## 135 Goldenberg, Mrs. Samuel L (Edwi yes female NA 1st
## 148 Harrington, Mr. Charles H no male NA 1st
## 153 Hawksford, Mr. Walter James yes male NA 1st
## 158 Hilliard, Mr. Herbert Henry no male NA 1st
## 167 Hoyt, Mr. William Fisher no male NA 1st
## [ reached getOption("max.print") -- omitted 243 rows ]
sum(!complete.cases(subset))
## [1] 263
Drop rows with NA values
subset <- na.omit(subset)
sum(!complete.cases(subset))
## [1] 0
1309-263
## [1] 1046
str(subset)
## 'data.frame': 1046 obs. of 5 variables:
## $ names : Factor w/ 1309 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ survived : Factor w/ 2 levels "no","yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.917 2 30 25 ...
## $ passengerClass: Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:263] 16 38 41 47 60 70 71 75 81 107 ...
## .. ..- attr(*, "names")= chr [1:263] "16" "38" "41" "47" ...
The filter() function is used to extract subsets of rows from a data frame
There are some rows in the “age” column that contain entries less than or equal to zero. These may not be data entry errors (could represent the ages of babies in months) however, let’s isolate and drop values less than 1 for the purposes of demonstration:
attach(subset)
summary(age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1667 21.0000 28.0000 29.8800 39.0000 80.0000
filtered <- filter(subset, age<1)
filtered
## names survived sex age passengerClass
## 1 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 2 Caldwell, Master. Alden Gates yes male 0.8333 2nd
## 3 Hamalainen, Master. Viljo yes male 0.6667 2nd
## 4 Richards, Master. George Sibley yes male 0.8333 2nd
## 5 West, Miss. Barbara J yes female 0.9167 2nd
## 6 Aks, Master. Philip Frank yes male 0.8333 3rd
## 7 Baclini, Miss. Eugenie yes female 0.7500 3rd
## 8 Baclini, Miss. Helene Barbara yes female 0.7500 3rd
## 9 Danbom, Master. Gilbert Sigvard no male 0.3333 3rd
## 10 Dean, Miss. Elizabeth Gladys M yes female 0.1667 3rd
## 11 Peacock, Master. Alfred Edward no male 0.7500 3rd
## 12 Thomas, Master. Assad Alexander yes male 0.4167 3rd
Based on what we find using the Filter function, we can delete rows that meet a certain criteria. There are several ways to do this. A few examples are provided below:
Side note: changing the data frame variable name “subset” to “data” since subset is a built-in function. Subset is actually the logical expression used to indicate which elements or rows to keep based on a condition.
df <- subset
str(df)
## 'data.frame': 1046 obs. of 5 variables:
## $ names : Factor w/ 1309 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ survived : Factor w/ 2 levels "no","yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.917 2 30 25 ...
## $ passengerClass: Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:263] 16 38 41 47 60 70 71 75 81 107 ...
## .. ..- attr(*, "names")= chr [1:263] "16" "38" "41" "47" ...
finaldata <- subset(df, age>=1.0)
summary(finaldata)
## names survived sex
## Abbing, Mr. Anthony : 1 no :617 female:384
## Abbott, Master. Eugene Joseph : 1 yes:417 male :650
## Abbott, Mr. Rossmore Edward : 1
## Abbott, Mrs. Stanton (Rosa Hunt: 1
## Abelseth, Miss. Karen Marie : 1
## Abelseth, Mr. Olaus Jorgensen : 1
## (Other) :1028
## age passengerClass
## Min. : 1.00 1st:283
## 1st Qu.:21.00 2nd:257
## Median :28.00 3rd:494
## Mean :30.22
## 3rd Qu.:39.00
## Max. :80.00
##
The Arrange function allows you to order the rows by one or more columns in ascending or descending order:
head(arrange(finaldata, desc(passengerClass)))
## names survived sex age passengerClass
## 1 Abbing, Mr. Anthony no male 42 3rd
## 2 Abbott, Master. Eugene Joseph no male 13 3rd
## 3 Abbott, Mr. Rossmore Edward no male 16 3rd
## 4 Abbott, Mrs. Stanton (Rosa Hunt yes female 35 3rd
## 5 Abelseth, Miss. Karen Marie yes female 16 3rd
## 6 Abelseth, Mr. Olaus Jorgensen yes male 25 3rd
head(arrange(finaldata, desc(names)))
## names survived sex age passengerClass
## 1 Zimmerman, Mr. Leo no male 29.0 3rd
## 2 Zakarian, Mr. Ortin no male 27.0 3rd
## 3 Zakarian, Mr. Mapriededer no male 26.5 3rd
## 4 Zabour, Miss. Hileni no female 14.5 3rd
## 5 Yrois, Miss. Henriette (Mrs Ha no female 24.0 2nd
## 6 Youseff, Mr. Gerious no male 45.5 3rd
head(arrange(finaldata, age))
## names survived sex age passengerClass
## 1 Becker, Master. Richard F yes male 1 2nd
## 2 Laroche, Miss. Louise yes female 1 2nd
## 3 Mallet, Master. Andre yes male 1 2nd
## 4 Dean, Master. Bertram Vere yes male 1 3rd
## 5 Goodwin, Master. Sidney Leonard no male 1 3rd
## 6 Johnson, Miss. Eleanor Ileen yes female 1 3rd
The mutate() function exists to compute transformations of variables in a data frame. Often, you want to create new variables that are derived from existing variables and mutate() provides a clean interface for doing that.
finaldata <- mutate(finaldata, survive.numeric = ifelse(survived == "yes", 1, 0))
head(finaldata)
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29 1st
## 2 Allison, Miss. Helen Loraine no female 2 1st
## 3 Allison, Mr. Hudson Joshua Crei no male 30 1st
## 4 Allison, Mrs. Hudson J C (Bessi no female 25 1st
## 5 Anderson, Mr. Harry yes male 48 1st
## 6 Andrews, Miss. Kornelia Theodos yes female 63 1st
## survive.numeric
## 1 1
## 2 0
## 3 0
## 4 0
## 5 1
## 6 1
Finally, summarise() lets you calculate summary statistics. On its own summarise() isn’t that useful, but when combined with group_by() you can summarise by chunks of data.
class <- group_by(finaldata, passengerClass)
head(class)
## Source: local data frame [6 x 6]
## Groups: passengerClass
##
## names survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29 1st
## 2 Allison, Miss. Helen Loraine no female 2 1st
## 3 Allison, Mr. Hudson Joshua Crei no male 30 1st
## 4 Allison, Mrs. Hudson J C (Bessi no female 25 1st
## 5 Anderson, Mr. Harry yes male 48 1st
## 6 Andrews, Miss. Kornelia Theodos yes female 63 1st
## Variables not shown: survive.numeric (dbl)
summarize(class, sum(survive.numeric))
## Source: local data frame [3 x 2]
##
## passengerClass sum(survive.numeric)
## 1 1st 180
## 2 2nd 111
## 3 3rd 126
The pipe below does the following:
library(babynames)
library(magrittr)
library(ggplot2)
babynames %>%
filter(name %>% substr(1, 3) %>% equals("Mar")) %>%
group_by(year, sex) %>%
summarize(total = sum(n)) %>%
qplot(year, total, color = sex, data = ., geom = "line") %>%
add(ggtitle('Names starting with "Mar"')) %>%
print