IS 607 - Project 2

Wide data set 1 - Source fivethirtyeight

The source https://github.com/fivethirtyeight/data/blob/master/forecast-methodology/historical-senate-predictions.csv).

The data depicts the win/loss probability predictions for the years 2008 to 2012 US Senate candidates by state.

1. Load the required packages:

library(dplyr)
library(tidyr)
library(knitr)
library(ggplot2)

2. After downloding the source file from github, read the CSV file into an R object

senate_pred <- read.csv("https://raw.githubusercontent.com/isrini/SI_IS607/master/historical-senate-predictions.csv")
glimpse(senate_pred)

## Observations: 207
## Variables: 6
## $ state         (fctr) Alabama, Alabama, Alabama, Alabama, Alaska, Ala...
## $ year          (int) 2008, 2008, 2010, 2010, 2008, 2008, 2010, 2010, ...
## $ candidate     (fctr) Sessions, Figures, Shelby, Barnes, Begich, Stev...
## $ forecast_prob (dbl) 1.00, 0.00, 1.00, 0.00, 1.00, 0.00, 0.71, 0.21, ...
## $ result        (fctr) Win, Lose, Win, Loss, Win, Lose, Lose, Win, Los...
## $ winflag       (int) 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, ...

3. Perform data wrangling with tidyr and dplyr packages

The source file has data from the years 2008 to 2012 and does not reflect all the 50 states for each of those years. For the purpose of this study, lets filter the records for the year 2012 Senate race.

senate_2012 <- filter(senate_pred, year == '2012')
senate_2012 <- select(senate_2012, state, candidate, forecast_prob, result, winflag)

head(senate_2012)

##         state candidate forecast_prob result winflag
## 1     Arizona     Flake          0.96    Win       1
## 2     Arizona   Carmona          0.04   Loss       0
## 3  California Feinstein          1.00    Win       1
## 4  California     Emken          0.00   Loss       0
## 5 Connecticut    Murphy          0.96    Win       1
## 6 Connecticut   McMahon          0.04   Loss       0

The data set is now in a wide format that would still need some tidying and formatting before we can analyze the final set. Renaming the data from the columns ‘result’ and ‘winflag’

senate_2012$result <- gsub("Win", "Winner", senate_2012$result)
senate_2012$result <- gsub("Loss", "Loser", senate_2012$result)

senate_2012$winflag <- gsub('1', "win_forecast_prob", senate_2012$winflag)
senate_2012$winflag <- gsub('0', "loss_forecast_prob", senate_2012$winflag)

head(senate_2012)

##         state candidate forecast_prob result            winflag
## 1     Arizona     Flake          0.96 Winner  win_forecast_prob
## 2     Arizona   Carmona          0.04  Loser loss_forecast_prob
## 3  California Feinstein          1.00 Winner  win_forecast_prob
## 4  California     Emken          0.00  Loser loss_forecast_prob
## 5 Connecticut    Murphy          0.96 Winner  win_forecast_prob
## 6 Connecticut   McMahon          0.04  Loser loss_forecast_prob

Now, spread the column ‘result’ in to two columns to show the ‘loser’ and ‘winner’ by name.

senate_2012 <- spread(senate_2012, result, candidate)
head(senate_2012)

##         state forecast_prob            winflag   Loser    Winner
## 1     Arizona          0.04 loss_forecast_prob Carmona      <NA>
## 2     Arizona          0.96  win_forecast_prob    <NA>     Flake
## 3  California          0.00 loss_forecast_prob   Emken      <NA>
## 4  California          1.00  win_forecast_prob    <NA> Feinstein
## 5 Connecticut          0.04 loss_forecast_prob McMahon      <NA>
## 6 Connecticut          0.96  win_forecast_prob    <NA>    Murphy

Do the same with the ‘winflag’ column and show the forecast probability as losss and win probability columns.

senate_2012 <- spread(senate_2012, winflag, forecast_prob)
head(senate_2012)

##         state   Loser    Winner loss_forecast_prob win_forecast_prob
## 1     Arizona Carmona      <NA>               0.04                NA
## 2     Arizona    <NA>     Flake                 NA              0.96
## 3  California   Emken      <NA>               0.00                NA
## 4  California    <NA> Feinstein                 NA              1.00
## 5 Connecticut McMahon      <NA>               0.04                NA
## 6 Connecticut    <NA>    Murphy                 NA              0.96

We have the State row occuring twice to show each candidate and the corresponding data. Using the group by function and summarise, the two State rows have been merged in to one row. And the NA references removed.

senate_2012 <- senate_2012 %>%
      group_by(state) %>%
      summarise_each(funs(first(.[!is.na(.)])))
    
head(senate_2012)

## Source: local data frame [6 x 5]
## 
##         state   Loser    Winner loss_forecast_prob win_forecast_prob
##        (fctr)  (fctr)    (fctr)              (dbl)             (dbl)
## 1     Arizona Carmona     Flake               0.04              0.96
## 2  California   Emken Feinstein               0.00              1.00
## 3 Connecticut McMahon    Murphy               0.04              0.96
## 4    Delaware    Wade    Carper               0.00              1.00
## 5     Florida    Mack    Nelson               0.00              1.00
## 6      Hawaii  Lingle    Hirono               0.00              1.00

Here is the transformed Tidy data set

kable(senate_2012)

state	Loser	Winner	loss_forecast_prob	win_forecast_prob
Arizona	Carmona	Flake	0.04	0.96
California	Emken	Feinstein	0.00	1.00
Connecticut	McMahon	Murphy	0.04	0.96
Delaware	Wade	Carper	0.00	1.00
Florida	Mack	Nelson	0.00	1.00
Hawaii	Lingle	Hirono	0.00	1.00
Indiana	Lingle	Donnelly	0.30	0.70
Maine	Dill	King	0.01	0.93
Maryland	Bongino	Cardin	0.00	1.00
Massachusetts	Brown	Warren	0.06	0.94
Michigan	Hoekstra	Stabenow	0.00	1.00
Minnesota	Bills	Klobuchar	0.00	1.00
Mississippi	Gore	Wicker	0.00	1.00
Missouri	Akin	McCaskill	0.02	0.98
Montana	Tester	Rehberg	0.34	0.66
Nebraska	Kerrey	Fischer	0.01	0.99
Nevada	Berkeley	Heller	0.17	0.83
New Jersey	Kyrillos	Menendez	0.00	1.00
New Mexico	Wilson	Heinrich	0.03	0.97
New York	Long	Gillibrand	0.00	1.00
North Dakota	Berg	Heitkamp	0.92	0.08
Ohio	Mandel	Brown	0.03	0.97
Pennsylvania	Smith	Casey	0.01	0.99
Rhode Island	Hinkley	Whitehouse	0.00	1.00
Tennessee	Clayton	Corker	0.00	1.00
Texas	Sadler	Cruz	0.00	1.00
Utah	Lowell	Hatch	0.00	1.00
Vermont	MacGovern	Sanders	0.00	1.00
Virginia	Allen	Kaine	0.12	0.88
Washington	Baumgartner	Cantwell	0.12	0.88
West Virginia	Raese	Manchin	0.08	0.92
Wisconsin	Baldwin	Thompson	0.21	0.79
Wyoming	Chestnut	Barrasso	0.00	1.00

3. Perform win and loss probability analysis.

We will filter for those with less than 100% win or loss accuracy prediction, to analyze the others who were not in an one sided race.

senate_2012_winners <- select(senate_2012, state, Winner, win_forecast_prob)

senate_2012_winners %>%
  mutate(win_accuracy = win_forecast_prob * 100 ) %>%
  filter(win_accuracy < 100) %>%
  select(state, Winner, win_accuracy)

## Source: local data frame [17 x 3]
## 
##            state    Winner win_accuracy
##           (fctr)    (fctr)        (dbl)
## 1        Arizona     Flake           96
## 2    Connecticut    Murphy           96
## 3        Indiana  Donnelly           70
## 4          Maine      King           93
## 5  Massachusetts    Warren           94
## 6       Missouri McCaskill           98
## 7        Montana   Rehberg           66
## 8       Nebraska   Fischer           99
## 9         Nevada    Heller           83
## 10    New Mexico  Heinrich           97
## 11  North Dakota  Heitkamp            8
## 12          Ohio     Brown           97
## 13  Pennsylvania     Casey           99
## 14      Virginia     Kaine           88
## 15    Washington  Cantwell           88
## 16 West Virginia   Manchin           92
## 17     Wisconsin  Thompson           79

senate_2012_losers <- select(senate_2012, state, Loser, loss_forecast_prob)

senate_2012_losers %>%
  mutate(loss_accuracy = loss_forecast_prob * 100 ) %>%
  filter(loss_accuracy < 100) %>%
  select(state, Loser, loss_accuracy)

## Source: local data frame [33 x 3]
## 
##            state   Loser loss_accuracy
##           (fctr)  (fctr)         (dbl)
## 1        Arizona Carmona             4
## 2     California   Emken             0
## 3    Connecticut McMahon             4
## 4       Delaware    Wade             0
## 5        Florida    Mack             0
## 6         Hawaii  Lingle             0
## 7        Indiana  Lingle            30
## 8          Maine    Dill             1
## 9       Maryland Bongino             0
## 10 Massachusetts   Brown             6
## ..           ...     ...           ...

4. Visual Plotting of the winning and losing predictions

library(ggplot2)
qplot(win_forecast_prob, data = senate_2012_winners, geom = "dotplot")

## stat_bindot: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Almost all the win predictions came true except for one senate race, the single dot to the left. This candidate won the race even though the prediction was very low probability for win.

library(ggplot2)
qplot(loss_forecast_prob, data = senate_2012_losers, geom = "dotplot")

## stat_bindot: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

This is a similar plot with the reverse view of the win_prediction probability for the senate race losers. The one senate race that we saw above is also shown here as a wrong prediction. The person who was predicted to win lost and the single dot to the right represents that.

Overall, This is a high quality predictive analysis by FiveThirtyEight.com based on the poll data from various polling sources and other indicators or variables.

Wide data set 2 (PanTHERIA - Suggested By Veneranda Skrelja )

Jones, K.E., et al. PanTHERIA: a species-level database of life history, ecology, and geography of extant and recently extinct mammals. Ecology 90:2648. http://esapubs.org/archive/ecol/E090/184/

pantheria <-  "http://esapubs.org/archive/ecol/E090/184/PanTHERIA_1-0_WR05_Aug2008.txt"
download.file(pantheria, destfile = "mammals.txt")

# Read the data from the downloaded txt file in to a data set called 'mammals'
mammals <- read.table("mammals.txt", sep = "\t", header = TRUE,  stringsAsFactors = FALSE)

# Tidy the column names
names(mammals) <- sub("X[0-9._]+", "", names(mammals))
names(mammals) <- sub("MSW05_", "", names(mammals))

# Select the columns names to be used for further transformation
mammals <- dplyr::select(mammals, Order, Binomial, AdultBodyMass_g, AdultHeadBodyLen_mm, HomeRange_km2, LitterSize)
head(mammals)

##          Order            Binomial AdultBodyMass_g AdultHeadBodyLen_mm
## 1 Artiodactyla Camelus dromedarius       492714.47             -999.00
## 2    Carnivora       Canis adustus        10392.49              745.32
## 3    Carnivora        Canis aureus         9658.70              827.53
## 4    Carnivora       Canis latrans        11989.10              872.39
## 5    Carnivora         Canis lupus        31756.51             1055.00
## 6 Artiodactyla       Bos frontalis       800143.05             2700.00
##   HomeRange_km2 LitterSize
## 1        196.32       0.98
## 2          1.01       4.50
## 3          2.95       3.74
## 4         18.88       5.72
## 5        159.86       4.98
## 6       -999.00       1.22

# Renaming the column names to lower case and adding '_' in between words and other formatting
names(mammals) <- gsub("([A-Z])", "_\\L\\1", names(mammals), perl = TRUE)
names(mammals) <- gsub("^_", "", names(mammals), perl = TRUE)
mammals[mammals == -999] <- NA
names(mammals)[names(mammals) == "binomial"] <- "species"

head(mammals)

##          order             species adult_body_mass_g
## 1 Artiodactyla Camelus dromedarius         492714.47
## 2    Carnivora       Canis adustus          10392.49
## 3    Carnivora        Canis aureus           9658.70
## 4    Carnivora       Canis latrans          11989.10
## 5    Carnivora         Canis lupus          31756.51
## 6 Artiodactyla       Bos frontalis         800143.05
##   adult_head_body_len_mm home_range_km2 litter_size
## 1                     NA         196.32        0.98
## 2                 745.32           1.01        4.50
## 3                 827.53           2.95        3.74
## 4                 872.39          18.88        5.72
## 5                1055.00         159.86        4.98
## 6                2700.00             NA        1.22

1. Data manipulation

# Mutate - to add new columns using current data
head(mutate(mammals, adult_body_mass_kg = adult_body_mass_g / 1000))

##          order             species adult_body_mass_g
## 1 Artiodactyla Camelus dromedarius         492714.47
## 2    Carnivora       Canis adustus          10392.49
## 3    Carnivora        Canis aureus           9658.70
## 4    Carnivora       Canis latrans          11989.10
## 5    Carnivora         Canis lupus          31756.51
## 6 Artiodactyla       Bos frontalis         800143.05
##   adult_head_body_len_mm home_range_km2 litter_size adult_body_mass_kg
## 1                     NA         196.32        0.98          492.71447
## 2                 745.32           1.01        4.50           10.39249
## 3                 827.53           2.95        3.74            9.65870
## 4                 872.39          18.88        5.72           11.98910
## 5                1055.00         159.86        4.98           31.75651
## 6                2700.00             NA        1.22          800.14305

# Summarising data by grouping
head(summarise(group_by(mammals, order),
  mean_mass = mean(adult_body_mass_g, na.rm = TRUE)))

## Source: local data frame [6 x 2]
## 
##          order    mean_mass
##          (chr)        (dbl)
## 1 Afrosoricida 9.475564e+01
## 2 Artiodactyla 1.213294e+05
## 3    Carnivora 4.738645e+04
## 4      Cetacea 7.373065e+06
## 5   Chiroptera 5.772033e+01
## 6    Cingulata 4.699230e+03

# Piping data
piping = mammals %>%
    mutate(mass_to_length = adult_body_mass_g / adult_head_body_len_mm) %>%
    arrange(desc(mass_to_length)) %>%
    select(species, mass_to_length, litter_size)

head(piping)

##                  species mass_to_length litter_size
## 1     Balaena mysticetus       6538.967        1.00
## 2  Balaenoptera musculus       5063.035        1.00
## 3 Megaptera novaeangliae       2333.503        1.00
## 4  Eschrichtius robustus       2309.354        1.00
## 5  Balaenoptera physalus       2301.529        1.01
## 6        Elephas maximus       1703.728        1.41

2. Data Analysis

# Using group by and summarise to find the mean litter_size by species
mean = group_by(mammals,species) %>% 
    filter(litter_size != 'NA') %>% 
    summarise(mean(litter_size))

head(mean)

## Source: local data frame [6 x 2]
## 
##                species mean(litter_size)
##                  (chr)             (dbl)
## 1    Abeomelomys sevia              1.00
## 2   Abrocoma bennettii              4.86
## 3     Abrocoma cinerea              2.19
## 4 Abrothrix longipilis              3.70
## 5  Abrothrix olivaceus              5.34
## 6  Acerodon celebensis              0.98