Loading of libraries and initial cleanup work
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the crime-trends table straight from GitHub. The first three lines of
# the file are skipped, names are kept exactly as they appear in the header
# (check.names = FALSE) and text is left as character rather than factor.
crime <- read.csv(
  "https://raw.githubusercontent.com/bkreis84/Project-2/master/table_12_crime_trends_by_population_group_2012-2013.csv",
  header = TRUE,
  skip = 3,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Give the second column a usable name.
names(crime)[2] <- "Year"
# Keep only the first 42 rows and the first 14 columns.
crime <- crime[seq_len(42), seq_len(14)]
tbl_df(crime)
## Source: local data frame [42 x 14]
##
## Population group Year Violent\n crime
## (chr) (chr) (chr)
## 1 TOTAL ALL AGENCIES: 2012 1,145,272
## 2 2013 1,095,149
## 3 Percent change -4.4
## 4 TOTAL CITIES 2012 919,218
## 5 2013 877,594
## 6 Percent change -4.5
## 7 GROUP I (250,000 and over) 2012 416,885
## 8 2013 402,988
## 9 Percent change -3.3
## 10 1,000,000 and over (Group I subset) 2012 166,007
## .. ... ... ...
## Variables not shown: Murder and nonnegligent manslaughter (chr), Forcible
## rape1 (chr), Robbery (chr), Aggravated assault (chr), Property crime
## (chr), Burglary (chr), Larceny- theft (chr), Motor vehicle theft (chr),
## Arson (chr), Number of agencies (chr), 2013 estimated population (chr)
We do not care about the percent change in our analysis so we can remove this using a logical vector.
# Rows come in triples (2012, 2013, "Percent change"); keep the first two rows
# of every triple and drop the third.
crimex <- crime[rep(c(TRUE, TRUE, FALSE), length.out = nrow(crime)), ]
We now replace blanks with NA values so that we can then use the fill function to fill in missing values using the value above.
# Turn empty strings into NA so that fill() treats them as missing, then carry
# each population-group label down onto the rows beneath it.
crimex[crimex == ""] <- NA
crimex <- fill(crimex, `Population group`)
After noticing some errors (likely spaces on the end of character strings), we change our variable names.
We then select the specific variables we want using the “select” function. We then use the arrange function to sort our data in a way that we can once again use the fill function to fill in missing values with values from above.
# Overwrite the names of columns 3, 8 and 14 with clean labels.
names(crimex)[c(3, 8, 14)] <- c(
  "Violent crime",
  "Property crime",
  "2013 estimated population"
)
# Keep the five columns of interest, sort so that within each population group
# the 2013 row comes before the 2012 row, and then copy the population figure
# down into the rows where it is missing.
crimex <- crimex %>%
  select(
    Year,
    `Population group`,
    `Violent crime`,
    `Property crime`,
    `2013 estimated population`
  ) %>%
  arrange(`Population group`, desc(Year)) %>%
  fill(`2013 estimated population`)
I found a function to trim data using regular expressions. I hope to learn more about functions in the coming weeks. http://r.789695.n4.nabble.com/Remove-space-from-string-td4292042.html
Once again, we remove data not pertinent to our analysis.
# Remove every comma and any leading or trailing whitespace from each element
# of x, so that strings such as " 1,145,272 " can later be converted to numbers.
trim <- function(x) {
  gsub(",|^[[:space:]]+|[[:space:]]+$", "", x)
}
# Apply trim() to every column. sapply() returns a character matrix here, so it
# is wrapped back into a data frame; check.names = FALSE keeps the column names
# that contain spaces.
crimex <- data.frame(sapply(crimex, trim), check.names = FALSE)
# Keep only the population-size groups whose label starts with "GROUP"
# (GROUP I to GROUP VI). Selecting by label rather than by hard-coded row
# positions means the result no longer depends on how the preceding sort
# happened to order the rows.
crimex <- crimex[grepl("^GROUP", crimex$`Population group`), ]
Change our character values to numeric now that they have been trimmed. This step took me quite a while because every time I tried to convert the values to numeric, they were filled in with NA values. This is why I needed the trim function above (there were characters and spaces preventing the conversion).
# Convert the count and population columns to numeric and the group label to
# character. Going through as.character() first gives the printed value
# rather than the internal level code if a column happens to be a factor.
crimex <- crimex %>%
  mutate(
    `Violent crime` = as.numeric(as.character(`Violent crime`)),
    `Property crime` = as.numeric(as.character(`Property crime`)),
    `Population group` = as.character(`Population group`),
    `2013 estimated population` =
      as.numeric(as.character(`2013 estimated population`))
  )
# One table per crime type: each drops the other crime-count column.
vcrime <- crimex %>% select(-`Property crime`)
pcrime <- crimex %>% select(-`Violent crime`)
I spread the data in order to get the year as a variable (I later determined that this did not really tidy the data). I also created new variables which showed the rate of violent crime per 100,000 population.
# Wide version: one column of violent-crime counts per year, plus the rate per
# 100,000 population for each year. Both rates divide by the 2013 estimated
# population, the only population column present.
vcrime <- vcrime %>%
  spread(Year, `Violent crime`) %>%
  mutate(
    `2012 Violence Rate` = `2012` / (`2013 estimated population` / 100000),
    `2013 Violence Rate` = `2013` / (`2013 estimated population` / 100000)
  )
# The long table (crimex) is the tidy one, so the plotted rate is added there.
# as.integer() drops the fractional part of the rate.
crimex <- crimex %>%
  mutate(
    `Violence Rate` =
      as.integer(`Violent crime` / (`2013 estimated population` / 100000))
  )
tbl_df(crimex)
## Source: local data frame [12 x 6]
##
## Year Population group Violent crime Property crime
## (fctr) (chr) (dbl) (dbl)
## 1 2013 GROUP I (250000 and over) 402988 2097875
## 2 2012 GROUP I (250000 and over) 416885 2155161
## 3 2013 GROUP II (100000 to 249999) 145408 1130942
## 4 2012 GROUP II (100000 to 249999) 154144 1167627
## 5 2013 GROUP III (50000 to 99999) 110033 942519
## 6 2012 GROUP III (50000 to 99999) 116180 977397
## 7 2013 GROUP IV (25000 to 49999) 80441 806275
## 8 2012 GROUP IV (25000 to 49999) 85900 836125
## 9 2013 GROUP V (10000 to 24999) 73998 778955
## 10 2012 GROUP V (10000 to 24999) 77759 817630
## 11 2013 GROUP VI (under 10000) 64726 678313
## 12 2012 GROUP VI (under 10000) 68350 717869
## Variables not shown: 2013 estimated population (dbl), Violence Rate (int)
# Violent-crime rate by year, one coloured line (with points) per population
# group.
ggplot(
  data = crimex,
  mapping = aes(
    x = Year,
    y = `Violence Rate`,
    color = `Population group`,
    group = `Population group`
  )
) +
  geom_point(size = 2) +
  geom_line(size = 2)
This one is relatively straightforward. We begin by pulling in the data and removing unnecessary columns.
# Load the wins table and drop its 2nd and 5th columns in a single step.
wins <- read.csv(
  file = "https://raw.githubusercontent.com/bkreis84/Project-2/master/wins.csv"
)
wins <- wins[, -c(2, 5)]
We then use the gather function to tidy up our data.
# Reshape columns 2 to 31 from wide to long: the former column names go into
# "Team" and the cell values into "Wins".
wins <- wins %>%
  gather(key = "Team", value = "Wins", 2:31)
This graph is relatively useless! Just for practice
# Wins per year, one coloured line (with points) per team.
ggplot(
  data = wins,
  mapping = aes(x = Year, y = Wins, color = Team, group = Team)
) +
  geom_point(size = 2) +
  geom_line(size = 2)
Again we start by bringing in the data and doing some initial cleanup.
# Load the Jets/Giants season statistics and preview them.
giants <- read.csv(
  file = "https://raw.githubusercontent.com/bkreis84/Project-2/master/Jets%20Giants.csv"
)
tbl_df(giants)
## Source: local data frame [10 x 10]
##
## Team Stats Year X X.1 X.2 X.3 X.4 X.5
## (fctr) (fctr) (int) (int) (int) (int) (int) (int) (int)
## 1 2007 2008 2009 2010 2011 2012 2013
## 2 Jets Total Passing Yards 3014 3303 2380 3242 3297 2891 2932
## 3 Total Rushing Yards 1701 2004 2756 2374 1692 1896 2158
## 4 Total First Downs 286 308 280 307 301 299 280
## 5 Touchdowns 26 48 37 39 45 31 27
## 6 NA NA NA NA NA NA NA
## 7 Giants Total Passing Yards 3154 3177 4019 3885 4734 3825 3588
## 8 Total Rushing Yards 2148 2518 1837 2200 1427 1862 1332
## 9 Total First Downs 321 338 323 331 331 327 280
## 10 Touchdowns 44 45 46 48 47 47 32
## Variables not shown: X.6 (int)
# The team name appears only on the first row of each team's section; write it
# into the remaining rows of that section.
giants$Team[3:5] <- "Jets"
giants$Team[8:10] <- "Giants"
# Label the eight value columns with the seasons they hold.
colnames(giants) <- c("Team", "Stats", as.character(2007:2014))
# Drop row 1 (the years, which were read in as data) and row 6 (the empty
# separator between the two teams).
giants <- giants[-c(1, 6), ]
We use the gather and spread functions to tidy the data and mutate to create a new variable.
# Stack the eight season columns into Year/n pairs, then give every statistic
# its own column, leaving one row per team and year.
giants <- giants %>%
  gather(key = "Year", value = "n", 3:10) %>%
  spread(key = "Stats", value = n)
tbl_df(giants)
## Source: local data frame [16 x 6]
##
## Team Year Total First Downs Total Passing Yards Total Rushing Yards
## (fctr) (fctr) (int) (int) (int)
## 1 Giants 2007 321 3154 2148
## 2 Giants 2008 338 3177 2518
## 3 Giants 2009 323 4019 1837
## 4 Giants 2010 331 3885 2200
## 5 Giants 2011 331 4734 1427
## 6 Giants 2012 327 3825 1862
## 7 Giants 2013 280 3588 1332
## 8 Giants 2014 336 4272 1603
## 9 Jets 2007 286 3014 1701
## 10 Jets 2008 308 3303 2004
## 11 Jets 2009 280 2380 2756
## 12 Jets 2010 307 3242 2374
## 13 Jets 2011 301 3297 1692
## 14 Jets 2012 299 2891 1896
## 15 Jets 2013 280 2932 2158
## 16 Jets 2014 289 2946 2280
## Variables not shown: Touchdowns (int)
# Total offence: passing yards plus rushing yards.
giants <- mutate(
  giants,
  `Total Yards` = `Total Passing Yards` + `Total Rushing Yards`
)
We then use various dplyr functions to obtain information for our analysis
# Per team and season: the share of total yards gained by passing and by
# rushing, and touchdowns per first down. Each Team/Year group holds a single
# row, so summarise() returns one row per team-season.
giants1 <- giants %>%
  group_by(Team, Year) %>%
  summarise(
    `Passing Ratio` = `Total Passing Yards` / `Total Yards`,
    `Rushing Ratio` = `Total Rushing Yards` / `Total Yards`,
    `TD/1stDown` = Touchdowns / `Total First Downs`
  ) %>%
  # summarise() only removes the last grouping level, so the result would
  # otherwise stay grouped by Team; ungroup so later verbs are not silently
  # applied per team.
  ungroup()
tbl_df(giants1)
## Source: local data frame [16 x 5]
##
## Team Year Passing Ratio Rushing Ratio TD/1stDown
## (fctr) (fctr) (dbl) (dbl) (dbl)
## 1 Giants 2007 0.5948699 0.4051301 0.13707165
## 2 Giants 2008 0.5578578 0.4421422 0.13313609
## 3 Giants 2009 0.6863046 0.3136954 0.14241486
## 4 Giants 2010 0.6384552 0.3615448 0.14501511
## 5 Giants 2011 0.7683818 0.2316182 0.14199396
## 6 Giants 2012 0.6725866 0.3274134 0.14373089
## 7 Giants 2013 0.7292683 0.2707317 0.11428571
## 8 Giants 2014 0.7271489 0.2728511 0.13095238
## 9 Jets 2007 0.6392365 0.3607635 0.09090909
## 10 Jets 2008 0.6223855 0.3776145 0.15584416
## 11 Jets 2009 0.4633956 0.5366044 0.13214286
## 12 Jets 2010 0.5772792 0.4227208 0.12703583
## 13 Jets 2011 0.6608539 0.3391461 0.14950166
## 14 Jets 2012 0.6039273 0.3960727 0.10367893
## 15 Jets 2013 0.5760314 0.4239686 0.09642857
## 16 Jets 2014 0.5637199 0.4362801 0.09342561
# Per-team summary across all seasons: best and worst touchdown totals and the
# average rushing, passing and total yards.
giants2 <- summarise(
  group_by(giants, Team),
  `Max TD` = max(Touchdowns),
  `Min TD` = min(Touchdowns),
  `Avg Rushing Yards` = mean(`Total Rushing Yards`),
  `Avg Passing Yards` = mean(`Total Passing Yards`),
  `Avg Total Yards` = mean(`Total Yards`)
)
tbl_df(giants2)
## Source: local data frame [2 x 6]
##
## Team Max TD Min TD Avg Rushing Yards Avg Passing Yards Avg Total Yards
## (fctr) (int) (int) (dbl) (dbl) (dbl)
## 1 Giants 48 32 1865.875 3831.750 5697.625
## 2 Jets 48 26 2107.625 3000.625 5108.250
# Keep Year, Team and every column whose name contains "Yards".
giants3 <- giants %>%
  select(Year, Team, contains("Yards"))
tbl_df(giants3)
## Source: local data frame [16 x 5]
##
## Year Team Total Passing Yards Total Rushing Yards Total Yards
## (fctr) (fctr) (int) (int) (int)
## 1 2007 Giants 3154 2148 5302
## 2 2008 Giants 3177 2518 5695
## 3 2009 Giants 4019 1837 5856
## 4 2010 Giants 3885 2200 6085
## 5 2011 Giants 4734 1427 6161
## 6 2012 Giants 3825 1862 5687
## 7 2013 Giants 3588 1332 4920
## 8 2014 Giants 4272 1603 5875
## 9 2007 Jets 3014 1701 4715
## 10 2008 Jets 3303 2004 5307
## 11 2009 Jets 2380 2756 5136
## 12 2010 Jets 3242 2374 5616
## 13 2011 Jets 3297 1692 4989
## 14 2012 Jets 2891 1896 4787
## 15 2013 Jets 2932 2158 5090
## 16 2014 Jets 2946 2280 5226
Finally we create a couple charts to show the Total yards of each team over the years and the passing ratio. The information here conclusively proves that The Giants > Jets.
# Total yards per season, one coloured line (with large points) per team.
ggplot(
  data = giants3,
  mapping = aes(x = Year, y = `Total Yards`, color = Team, group = Team)
) +
  geom_point(size = 8) +
  geom_line(size = 2)
# Passing share of total yards per season, drawn the same way.
ggplot(
  data = giants1,
  mapping = aes(x = Year, y = `Passing Ratio`, color = Team, group = Team)
) +
  geom_point(size = 8) +
  geom_line(size = 2)