Part 2 of my additional practice with data clean-up and transformations using tidyr and dplyr. CRAN documentation for tidyr is available here and dplyr documentation can be found here.
library(tidyr)
library(dplyr)
library(stringr)
This chart presents various details regarding living former Presidents of the United States:
# The source chart was transcribed into a CSV file with OpenOffice so that the
# data mirrors the chart as closely as possible. (Building the table with
# data.frame() and saving it with write.csv() would work equally well.)
presidents <- read.csv(file = "presidents.csv")
presidents
## President Term_of_Office DOB
## 1 George H.W. Bush 1989-1993 June 12, 1924 (age 91)
## 2 Jimmy Carter 1977-1981 October 1, 1924 (age 90)
## 3 George W. Bush 2001-2009 July 6, 1946 (age 69)
## 4 Bill Clinton 1993-2001 August 19, 1946 (age 69)
Restructure the dataset
# List the column names of the raw table before restructuring it
names(presidents)
## [1] "President" "Term_of_Office" "DOB"
# Split the Term_of_Office column (e.g. "1989-1993") into two separate columns.
# The first piece of each split is the year the term started and the second is
# the year it ended. vapply() returns a plain character vector, so the new
# columns are ordinary atomic columns rather than list columns.
term_years <- strsplit(
  as.character(presidents$Term_of_Office),
  "-",
  fixed = TRUE
)
presidents$Start_Term <- vapply(term_years, "[", character(1), 1)
presidents$End_Term <- vapply(term_years, "[", character(1), 2)
# Drop the old column
presidents <- subset(
  presidents,
  select = c(President, DOB, Start_Term, End_Term)
)
presidents
## President DOB Start_Term End_Term
## 1 George H.W. Bush June 12, 1924 (age 91) 1989 1993
## 2 Jimmy Carter October 1, 1924 (age 90) 1977 1981
## 3 George W. Bush July 6, 1946 (age 69) 2001 2009
## 4 Bill Clinton August 19, 1946 (age 69) 1993 2001
# Pull the presidents' age out of the DOB column and create a new Age column.
# str_match() returns exactly one row per input string (NA when no "(age NN)"
# is present), so the ages always stay aligned with their rows; column 2 holds
# the digits captured by the parenthesised group.
age_match <- str_match(
  as.character(presidents$DOB),
  "\\(age ([[:digit:]]+)\\)"
)
presidents$Age <- age_match[, 2]
presidents
## President DOB Start_Term End_Term Age
## 1 George H.W. Bush June 12, 1924 (age 91) 1989 1993 91
## 2 Jimmy Carter October 1, 1924 (age 90) 1977 1981 90
## 3 George W. Bush July 6, 1946 (age 69) 2001 2009 69
## 4 Bill Clinton August 19, 1946 (age 69) 1993 2001 69
# Drop old DOB column as we will not be using it from here forward
presidents <- subset(
  presidents,
  select = c(President, Start_Term, End_Term, Age)
)
presidents
## President Start_Term End_Term Age
## 1 George H.W. Bush 1989 1993 91
## 2 Jimmy Carter 1977 1981 90
## 3 George W. Bush 2001 2009 69
## 4 Bill Clinton 1993 2001 69
# Give a list of presidents who had one term in office (4 years) and presidents
# who had two terms in office (8 years)
# Convert term columns to numeric values
presidents$Start_Term <- as.numeric(presidents$Start_Term)
presidents$End_Term <- as.numeric(presidents$End_Term)
# Add a column to indicate number of years in office (Just to make final result
# more clear)
presidents$Yrs_Office <- presidents$End_Term - presidents$Start_Term
# Create a table of presidents with two terms (more than 4 years in office).
# Columns are referenced by bare name so filter() evaluates them against the
# piped data frame rather than the global `presidents` object.
terms_2 <- presidents %>% filter(Yrs_Office > 4)
terms_2
## President Start_Term End_Term Age Yrs_Office
## 1 George W. Bush 2001 2009 69 8
## 2 Bill Clinton 1993 2001 69 8
# Create a table of presidents with one term (4 years or fewer in office)
terms_1 <- presidents %>% filter(Yrs_Office <= 4)
terms_1
## President Start_Term End_Term Age Yrs_Office
## 1 George H.W. Bush 1989 1993 91 4
## 2 Jimmy Carter 1977 1981 90 4
# Which president is the oldest?
# Convert age column to numeric
presidents$Age <- as.numeric(presidents$Age)
max_age <- presidents %>% slice(which.max(Age))
max_age
## President Start_Term End_Term Age Yrs_Office
## 1 George H.W. Bush 1989 1993 91 4
# Compute the mean age across every president in the table
av_age_pres <- mean(presidents[["Age"]])
av_age_pres
## [1] 79.75