Packages used…
library(dplyr)
library(plyr)
library(tidyr)
library(stringr)
Review how the distribution of wealth has changed for the three groups listed over time.
#Import the data, skipping header and malformed columns
wealth<-read.table('https://raw.githubusercontent.com/RobertSellers/R/master/data/wealth.txt', header = FALSE, stringsAsFactors = FALSE,skip = 2,sep="\t")
#convert to local data frame
wealth <- tbl_df(wealth)
#Rename the columns
colnames(wealth)<-c("Year","Top_1_Percent","Next_19_percent","Bottom_80_percent")
head(wealth)
## Source: local data frame [6 x 4]
##
## Year Top_1_Percent Next_19_percent Bottom_80_percent
## (int) (chr) (chr) (chr)
## 1 1983 42.9% 48.4% 8.7%
## 2 1989 46.9% 46.5% 6.6%
## 3 1992 45.6% 46.7% 7.7%
## 4 1995 47.2% 45.9% 7.0%
## 5 1998 47.3% 43.6% 9.1%
## 6 2001 39.7% 51.5% 8.7%
#Using dplyr piping to convert to numeric percentage
wealthtidied<-wealth %>%
mutate(Top_1_Percent=as.numeric(sub("%","",Top_1_Percent))/100) %>%
mutate(Next_19_percent=as.numeric(sub("%","",Next_19_percent))/100) %>%
mutate(Bottom_80_percent=as.numeric(sub("%","",Bottom_80_percent))/100)
#Plotting with percents
#Discvered percentage axis label solution @
#http://stackoverflow.com/questions/11674637/plot-percentages-on-y-axis
percentages <- runif(10)
plot(wealthtidied$Year,wealthtidied$Top_1_Percent, ylim=c(0,1),main="Financial Wealth Distribution Trends 1983-2010",ylab="Percentage",xlab="Year",type="l", col="red",yaxt="n")
axis(2, at=pretty(percentages), lab=pretty(percentages) * 100, las=TRUE)
lines(wealthtidied$Year,wealthtidied$Next_19_percent, col="purple")
lines(wealthtidied$Year,wealthtidied$Bottom_80_percent, col="blue")
legend('topright', names(wealthtidied)[-1] ,
lty=1, col=c('red', 'purple', 'blue'), bty='n', cex=.75)
grid(nx=NULL, ny=NULL,col="black")
Though not any kind of deep statistical analysis, one could tidy the data and then find out the teams with the most World Series appearances, those with the least, who has won the most Series games, won the least, what team has the highest percentage of series wins per World Series appearances, ratio of 4-5-6 and 7- game series (a few were more), etc.
#Import the data as csv
worldseries<- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/worldseries.csv", header=TRUE,sep=",")
#convert to local data frame
worldseries <- tbl_df(worldseries)
#Use glimpse to preview the data
glimpse(worldseries)
## Observations: 124
## Variables: 3
## $ Year (fctr) 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007,...
## $ Results (fctr) Royals 4, Mets 1, Giants 4, Royals 3, Red Sox 4, Card...
## $ MVP (fctr) Salvador Perez, Madison Bumgarner, David Ortiz, Pablo...
#Using dplyr piping to modify the data
wstidied<-worldseries %>%
filter(!(Year=="Year")) %>% #remove rows with redundant column information
filter(!(Results=="Not Held")) %>% #remove years without world series results
filter(!(Year=="")) %>%
mutate(Results = str_replace(Results,"Chicago", "Chi.")) %>% #combine teams with multiple values
mutate(Results = str_replace(Results,"Boston Red Sox", "Boston")) %>% #combine teams with multiple values
mutate(Results = str_replace(Results,"St. Louis Cardinals", "St. Louis")) %>% #combine teams with multiple values
mutate(Results = gsub("\\([^\\]]*\\)", " ", Results, perl=TRUE)) %>% #remove tie character information
mutate(Results= str_trim(Results)) %>% #remove tie character information
filter(!is.na(Year)) %>%
separate(Results,c("Winner","Loser"),sep="\\,",remove=FALSE) %>% #splitting the Results
separate(Winner,c("WinningTeam","GamesWon1"), sep=' (?=[^ ]+$)',remove=FALSE) %>% #splitting the Results
separate(Loser,c("LosingTeam","GamesWon2"), sep=' (?=[^ ]+$)',remove=FALSE) %>% #splitting the Results
mutate(totalgames=(as.numeric(GamesWon1)+as.numeric(GamesWon2)))
head(wstidied)
## Source: local data frame [6 x 10]
##
## Year Results Winner WinningTeam GamesWon1
## (fctr) (chr) (chr) (chr) (chr)
## 1 2015 Royals 4, Mets 1 Royals 4 Royals 4
## 2 2014 Giants 4, Royals 3 Giants 4 Giants 4
## 3 2013 Red Sox 4, Cardinals 2 Red Sox 4 Red Sox 4
## 4 2012 Giants 4, Tigers 0 Giants 4 Giants 4
## 5 2011 Cardinals 4, Rangers 3 Cardinals 4 Cardinals 4
## 6 2010 Giants 4, Rangers 1 Giants 4 Giants 4
## Variables not shown: Loser (chr), LosingTeam (chr), GamesWon2 (chr), MVP
## (fctr), totalgames (dbl)
#Creating a frequency table for the winning teams
winners<-wstidied %>%
count('WinningTeam') %>%
rename(c('WinningTeam'='team')) %>%
arrange(freq)
tail(winners)
## team freq
## 27 NY Giants 5
## 28 Philadelphia A's 5
## 29 Pittsburgh 5
## 30 Boston 7
## 31 St. Louis 10
## 32 NY Yankees 26
#Creating a frequency table for the losing teams
losers<-wstidied %>%
count('LosingTeam') %>%
mutate(LosingTeam= str_trim(LosingTeam)) %>%
rename(c('LosingTeam'='team')) %>%
arrange(freq)
#Plotting the Win rankings
colfunc<-colorRampPalette(c("red","yellow","springgreen","royalblue"))
colnamesbarplot <- as.character(winners$team)
midpoints<-barplot(winners$freq,main="Teams ranked by world series wins",xlab="Number of Wins",horiz=TRUE,col=(colfunc(50)),axes=TRUE, names.arg=colnamesbarplot, cex.names=0.5, las=1, xlim=c(0,30))
grid(nx=NULL, ny=NA,col="black")
box()
#Calculating the winning percentages
winningpercentage<-merge(winners, losers, by="team") %>%
mutate(percentwin=(freq.x/(freq.x+freq.y))) %>%
arrange(percentwin)
#Plotting the winning percentages
colnamesbarplot2 <- as.character(winningpercentage$team)
barplot(winningpercentage$percentwin,main="Teams ranked by winning percentage",xlab="Percent of Games Won",horiz=TRUE,col=(colfunc(50)),axes=TRUE, names.arg=colnamesbarplot2, cex.names=0.5, las=1, xlim=c(0,1))
grid(nx=NULL, ny=NA,col="black")
box()
#Plotting the frequency results
barplot(table(wstidied$totalgames),main="Total number of games played",xlab="Number of Games",ylab="Frequency", ylim=c(0,40),col=(colfunc(10)))
box()
Group by name and RTTYP, and provide rankings for lengths.
It should be noted that the vector lengths are “doubled” for limited-access, parallel highways.
#Import the data as csv
tigerlines<- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/tiger_file_modified.txt", sep=",", header=TRUE)
#convert to local data frame
tigerlines <- tbl_df(tigerlines)
#Create and apply the RTTYP lookup table
RTTYP<-c("C" = "County", "I" = "Interstate", "M" = "Common Name", "O" = "Other", "S" = "State recognized", "U" = "U.S.")
tigerlines$RTTYP <- RTTYP[tigerlines$RTTYP]
#Use dplyr pipes to create grouped tables
tigerjoin1<-tigerlines %>%
select(LINEARID,RTTYP,FULLNAME, miles_length) %>%
group_by(RTTYP,FULLNAME) %>%
summarise_each(funs(sum(., na.rm=TRUE)),-LINEARID)%>%
arrange(desc(miles_length))
tigerjoin2<-tigerlines %>%
select(LINEARID,RTTYP, miles_length) %>%
group_by(RTTYP) %>%
summarise_each(funs(sum(., na.rm=TRUE)),-LINEARID)%>%
arrange(desc(miles_length))
#Create more subsets for the top results per RTTYP
topCommon<-tigerjoin1 %>%
filter(RTTYP =="Common Name")%>%
arrange(desc(miles_length))%>%
top_n(10)%>%
arrange(miles_length)
head(topCommon)
## Source: local data frame [6 x 3]
## Groups: RTTYP [1]
##
## RTTYP FULLNAME miles_length
## (chr) (fctr) (dbl)
## 1 Common Name Mojave Fwy 340.4389
## 2 Common Name Pacific Hwy 400.6257
## 3 Common Name Old Oregon Trail Hwy 419.4345
## 4 Common Name Southern Tier Expy 435.1429
## 5 Common Name Kansas Tpke 467.3304
## 6 Common Name Ohio Tpke 479.6728
topUS<-tigerjoin1 %>%
filter(RTTYP =="U.S.")%>%
arrange(desc(miles_length))%>%
top_n(10)%>%
arrange(miles_length)
topInterstate<-tigerjoin1 %>%
filter(RTTYP =="Interstate")%>%
arrange(desc(miles_length))%>%
top_n(10)%>%
arrange(miles_length)
#Plotting the results
colnamesbarplotCommon <- as.character(topCommon$FULLNAME)
barplot(topCommon$miles_length,main="Top 10 Longest Common Name Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotCommon, cex.names=0.5, las=1, xlim=c(0,800))
grid(nx=NULL, ny=NA,col="black")
box()
colnamesbarplotUS <- as.character(topUS$FULLNAME)
barplot(topUS$miles_length,main="Top 10 Longest U.S. Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotUS, cex.names=0.5, las=1, xlim=c(0,2000))
grid(nx=NULL, ny=NA,col="black")
box()
colnamesbarplotInterstate <- as.character(topInterstate$FULLNAME)
barplot(topInterstate$miles_length,main="Top 10 Longest Interstate Highways",xlab="Vector Length (in miles)",horiz=TRUE,col=(colfunc(20)),axes=TRUE, names.arg=colnamesbarplotInterstate, cex.names=0.5, las=1, xlim=c(0,7000))
grid(nx=NULL, ny=NA,col="black")
box()