Project Two with dplyr and tidyr

Packages used…

library(dplyr)
library(plyr)
library(tidyr)
library(stringr)

Example One from Ken Markus : Distribution of wealth in the US

Review how the distribution of wealth has changed for the three groups listed over time.

#Import the data, skipping header and malformed columns
wealth<-read.table('https://raw.githubusercontent.com/RobertSellers/R/master/data/wealth.txt', header = FALSE,  stringsAsFactors = FALSE,skip = 2,sep="\t")

#convert to local data frame
wealth <- tbl_df(wealth)

#Rename the columns
colnames(wealth)<-c("Year","Top_1_Percent","Next_19_percent","Bottom_80_percent")
head(wealth)
## Source: local data frame [6 x 4]
## 
##    Year Top_1_Percent Next_19_percent Bottom_80_percent
##   (int)         (chr)           (chr)             (chr)
## 1  1983         42.9%           48.4%              8.7%
## 2  1989         46.9%           46.5%              6.6%
## 3  1992         45.6%           46.7%              7.7%
## 4  1995         47.2%           45.9%              7.0%
## 5  1998         47.3%           43.6%              9.1%
## 6  2001         39.7%           51.5%              8.7%
#Using dplyr piping to convert to numeric percentage
wealthtidied<-wealth %>%
  mutate(Top_1_Percent=as.numeric(sub("%","",Top_1_Percent))/100) %>%
  mutate(Next_19_percent=as.numeric(sub("%","",Next_19_percent))/100) %>%
  mutate(Bottom_80_percent=as.numeric(sub("%","",Bottom_80_percent))/100)

#Plotting with percents
#Discvered percentage axis label solution @
#http://stackoverflow.com/questions/11674637/plot-percentages-on-y-axis
percentages <- runif(10)

plot(wealthtidied$Year,wealthtidied$Top_1_Percent, ylim=c(0,1),main="Financial Wealth Distribution Trends 1983-2010",ylab="Percentage",xlab="Year",type="l", col="red",yaxt="n")
axis(2, at=pretty(percentages), lab=pretty(percentages) * 100, las=TRUE)
lines(wealthtidied$Year,wealthtidied$Next_19_percent, col="purple")
lines(wealthtidied$Year,wealthtidied$Bottom_80_percent, col="blue")
legend('topright', names(wealthtidied)[-1] , 
lty=1, col=c('red', 'purple', 'blue'), bty='n', cex=.75)
grid(nx=NULL, ny=NULL,col="black")

Example Two from Logan Thomson : World Series Data

Though not any kind of deep statistical analysis, one could tidy the data and then find out the teams with the most World Series appearances, those with the least, who has won the most Series games, won the least, what team has the highest percentage of series wins per World Series appearances, ratio of 4-5-6 and 7- game series (a few were more), etc.

#Import the data as csv
worldseries<- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/worldseries.csv", header=TRUE,sep=",")

#convert to local data frame
worldseries <- tbl_df(worldseries)

#Use glimpse to preview the data
glimpse(worldseries)
## Observations: 124
## Variables: 3
## $ Year    (fctr) 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007,...
## $ Results (fctr) Royals 4, Mets 1, Giants 4, Royals 3, Red Sox 4, Card...
## $ MVP     (fctr) Salvador Perez, Madison Bumgarner, David Ortiz, Pablo...
#Using dplyr piping to modify the data
wstidied<-worldseries  %>% 
  filter(!(Year=="Year")) %>% #remove rows with redundant column information
  filter(!(Results=="Not Held")) %>% #remove years without world series results
  filter(!(Year=="")) %>%
  mutate(Results = str_replace(Results,"Chicago", "Chi.")) %>% #combine teams with multiple values
  mutate(Results = str_replace(Results,"Boston Red Sox", "Boston")) %>% #combine teams with multiple values
  mutate(Results = str_replace(Results,"St. Louis Cardinals", "St. Louis")) %>% #combine teams with multiple values
  mutate(Results = gsub("\\([^\\]]*\\)", " ", Results, perl=TRUE)) %>% #remove tie character information
  mutate(Results= str_trim(Results)) %>% #remove tie character information
  filter(!is.na(Year)) %>% 
  separate(Results,c("Winner","Loser"),sep="\\,",remove=FALSE) %>% #splitting the Results
  separate(Winner,c("WinningTeam","GamesWon1"), sep=' (?=[^ ]+$)',remove=FALSE) %>% #splitting the Results
  separate(Loser,c("LosingTeam","GamesWon2"), sep=' (?=[^ ]+$)',remove=FALSE) %>% #splitting the Results
  mutate(totalgames=(as.numeric(GamesWon1)+as.numeric(GamesWon2))) 
 
head(wstidied)
## Source: local data frame [6 x 10]
## 
##     Year                Results      Winner WinningTeam GamesWon1
##   (fctr)                  (chr)       (chr)       (chr)     (chr)
## 1   2015       Royals 4, Mets 1    Royals 4      Royals         4
## 2   2014     Giants 4, Royals 3    Giants 4      Giants         4
## 3   2013 Red Sox 4, Cardinals 2   Red Sox 4     Red Sox         4
## 4   2012     Giants 4, Tigers 0    Giants 4      Giants         4
## 5   2011 Cardinals 4, Rangers 3 Cardinals 4   Cardinals         4
## 6   2010    Giants 4, Rangers 1    Giants 4      Giants         4
## Variables not shown: Loser (chr), LosingTeam (chr), GamesWon2 (chr), MVP
##   (fctr), totalgames (dbl)
#Creating a frequency table for the winning teams
winners<-wstidied  %>%
  count('WinningTeam') %>%
  rename(c('WinningTeam'='team')) %>%
  arrange(freq)
 
tail(winners)
##                team freq
## 27        NY Giants    5
## 28 Philadelphia A's    5
## 29       Pittsburgh    5
## 30           Boston    7
## 31        St. Louis   10
## 32       NY Yankees   26
#Creating a frequency table for the losing teams
losers<-wstidied  %>%
  count('LosingTeam') %>%
  mutate(LosingTeam= str_trim(LosingTeam)) %>%
  rename(c('LosingTeam'='team')) %>%
  arrange(freq)
 
#Plotting the Win rankings 
colfunc<-colorRampPalette(c("red","yellow","springgreen","royalblue"))
colnamesbarplot <- as.character(winners$team)
midpoints<-barplot(winners$freq,main="Teams ranked by world series wins",xlab="Number of Wins",horiz=TRUE,col=(colfunc(50)),axes=TRUE, names.arg=colnamesbarplot, cex.names=0.5, las=1, xlim=c(0,30))
grid(nx=NULL, ny=NA,col="black")
box()

#Calculating the winning percentages
winningpercentage<-merge(winners, losers, by="team") %>%
  mutate(percentwin=(freq.x/(freq.x+freq.y))) %>%
  arrange(percentwin)

#Plotting the winning percentages
colnamesbarplot2 <- as.character(winningpercentage$team)
barplot(winningpercentage$percentwin,main="Teams ranked by winning percentage",xlab="Percent of Games Won",horiz=TRUE,col=(colfunc(50)),axes=TRUE, names.arg=colnamesbarplot2, cex.names=0.5, las=1, xlim=c(0,1))
grid(nx=NULL, ny=NA,col="black")
box()

#Plotting the frequency results
barplot(table(wstidied$totalgames),main="Total number of games played",xlab="Number of Games",ylab="Frequency", ylim=c(0,40),col=(colfunc(10)))
box()

Example Three from Myself : Tigerline shapefiles of the United States

Group by name and RTTYP, and provide rankings for lengths.

It should be noted that the vector lengths are “doubled” for limited-access, parallel highways.

#Import the data as csv
tigerlines<- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/tiger_file_modified.txt", sep=",", header=TRUE)

#convert to local data frame
tigerlines <- tbl_df(tigerlines)

#Create and apply the RTTYP lookup table
RTTYP<-c("C" = "County", "I" = "Interstate", "M" = "Common Name", "O" = "Other", "S" = "State recognized", "U" = "U.S.")
tigerlines$RTTYP <- RTTYP[tigerlines$RTTYP]

#Use dplyr pipes to create grouped tables
tigerjoin1<-tigerlines %>%
   select(LINEARID,RTTYP,FULLNAME, miles_length) %>%
   group_by(RTTYP,FULLNAME) %>% 
   summarise_each(funs(sum(., na.rm=TRUE)),-LINEARID)%>% 
   arrange(desc(miles_length))

tigerjoin2<-tigerlines %>%
   select(LINEARID,RTTYP, miles_length) %>%
   group_by(RTTYP) %>% 
   summarise_each(funs(sum(., na.rm=TRUE)),-LINEARID)%>% 
   arrange(desc(miles_length))
 
#Create more subsets for the top results per RTTYP
topCommon<-tigerjoin1 %>%
  filter(RTTYP =="Common Name")%>%
  arrange(desc(miles_length))%>%
  top_n(10)%>%
  arrange(miles_length)
head(topCommon)
## Source: local data frame [6 x 3]
## Groups: RTTYP [1]
## 
##         RTTYP             FULLNAME miles_length
##         (chr)               (fctr)        (dbl)
## 1 Common Name           Mojave Fwy     340.4389
## 2 Common Name          Pacific Hwy     400.6257
## 3 Common Name Old Oregon Trail Hwy     419.4345
## 4 Common Name   Southern Tier Expy     435.1429
## 5 Common Name          Kansas Tpke     467.3304
## 6 Common Name            Ohio Tpke     479.6728
topUS<-tigerjoin1 %>%
  filter(RTTYP =="U.S.")%>%
  arrange(desc(miles_length))%>%
  top_n(10)%>%
  arrange(miles_length)

topInterstate<-tigerjoin1 %>%
  filter(RTTYP =="Interstate")%>%
  arrange(desc(miles_length))%>%
  top_n(10)%>%
  arrange(miles_length)
#Plotting the results
colnamesbarplotCommon <- as.character(topCommon$FULLNAME)
  
barplot(topCommon$miles_length,main="Top 10 Longest Common Name Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotCommon, cex.names=0.5, las=1, xlim=c(0,800))
grid(nx=NULL, ny=NA,col="black")
box()

colnamesbarplotUS <- as.character(topUS$FULLNAME)
  
barplot(topUS$miles_length,main="Top 10 Longest U.S. Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotUS, cex.names=0.5, las=1, xlim=c(0,2000))
grid(nx=NULL, ny=NA,col="black")
box()

colnamesbarplotInterstate <- as.character(topInterstate$FULLNAME)
  
barplot(topInterstate$miles_length,main="Top 10 Longest Interstate Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotInterstate, cex.names=0.5, las=1, xlim=c(0,7000))
grid(nx=NULL, ny=NA,col="black")
box()