Pigeon_Color

The following R code can be seen rendered on RPubs here

library(ggplot2)
library(randomForest)
library(caret)
library(car)
library(dplyr)
library(ipred)
library(klaR)
library(corrplot)

# pull the young bird racing data off GitHub
data_url <- "https://raw.githubusercontent.com/dbouquin/Pigeons/master/American_Racing_Pigeon_Union_Young_Bird_Data.csv"
data_file <- "American_Racing_Pigeon_Union_Young_Bird_Data.csv"
download.file(data_url, data_file, method = "curl")

# stringsAsFactors is set explicitly: the rest of this analysis calls
# levels(), recodes factor levels and feeds factors to randomForest(), and
# since R 4.0.0 read.csv() no longer converts text columns to factors by default.
y_birds <- read.csv(data_file, na.strings = "NA", stringsAsFactors = TRUE)

# Columns 8 and 9 are renamed by position.
# NOTE(review): the original header names are not shown here -- confirm the
# column order of the CSV has not changed before relying on these indices.
colnames(y_birds)[8] <- "COLOR"
colnames(y_birds)[9] <- "SEX"

# Print the structure of the loaded data frame
str(y_birds)

## 'data.frame':    1229 obs. of  15 variables:
##  $ ORGANIZATION: Factor w/ 5 levels "75 Combine","Apple Valley RPC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LOCATION    : Factor w/ 5 levels "Arlington, TX",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ YEAR        : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ POS         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ NAME        : Factor w/ 422 levels "1151       /6",..: 253 254 311 310 329 328 328 310 328 142 ...
##  $ BAND        : int  915 916 15458 15396 15385 15417 15418 15325 15388 2618 ...
##  $ NUMBER      : Factor w/ 172 levels "0 MX 12 FMC",..: 151 151 157 157 157 157 157 157 157 140 ...
##  $ COLOR       : Factor w/ 76 levels "","____","B",..: 4 4 6 8 4 32 32 55 4 17 ...
##  $ SEX         : Factor w/ 3 levels "","C","H": 2 2 3 3 3 3 3 3 3 3 ...
##  $ ARRIVAL     : Factor w/ 1135 levels "-07:50:50","10:00:04",..: 195 197 177 178 179 180 181 181 182 297 ...
##  $ MILES       : Factor w/ 801 levels "10/11","10/13",..: 142 448 119 448 119 448 517 517 568 239 ...
##  $ TOWIN       : Factor w/ 877 levels "0","0.01","0.02",..: 1 9 84 86 87 89 336 336 344 354 ...
##  $ YPM         : num  1303 1302 1287 1287 1287 ...
##  $ WS.Std.Pts  : int  9 4 99 93 88 83 78 72 67 62 ...
##  $ NDB.Std.Pts : Factor w/ 101 levels "0","0  Distance less than 75 miles",..: 97 93 87 83 77 72 67 62 57 52 ...

# list the column names after the two renames above
names(y_birds)

##  [1] "ORGANIZATION" "LOCATION"     "YEAR"         "POS"         
##  [5] "NAME"         "BAND"         "NUMBER"       "COLOR"       
##  [9] "SEX"          "ARRIVAL"      "MILES"        "TOWIN"       
## [13] "YPM"          "WS.Std.Pts"   "NDB.Std.Pts"

# Visual inspection: one boxplot of speed (YPM) per colour code, to see
# whether there is a relationship between colour and speed.
bp <- ggplot(data = y_birds, aes(x = COLOR, y = YPM, fill = COLOR)) +
  geom_boxplot()

# The fill legend and the x-axis labels are hidden because there are too many
# colour codes to label legibly. legend.position = "none" is used instead of
# guides(fill = FALSE), which is deprecated in current ggplot2 releases;
# fill is the only aesthetic with a legend, so the plot is the same.
bp +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.text.x = element_blank()
  )

# Next I will run some stats.
# Significance test: the response (speed, YPM) is continuous and the predictor
# (COLOR) is nominal, so we start with a one-way ANOVA, assuming a completely
# randomized design.

fit <- aov(YPM ~ COLOR, data = y_birds) # COLOR is treated as a grouping factor

plot(fit) # diagnostic plots for the fitted model

# F statistic = variation among group means / variation within groups
summary(fit)

##               Df   Sum Sq Mean Sq F value   Pr(>F)    
## COLOR         75 11549820  153998   3.116 4.39e-16 ***
## Residuals   1153 56975823   49415                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now let's check whether we can train a model that combines a few variables to predict a high-scoring bird.

# make sure the points column is numeric
#
# NDB.Std.Pts is loaded as a factor whose labels are the scores, some with a
# trailing note (e.g. "0  Distance less than 75 miles"). Calling as.numeric()
# directly on a factor returns the internal level codes rather than the scores,
# so the labels are converted to text first and the leading number is parsed.
# Entries with no leading number become NA.
parse_points <- function(x) {
  txt <- trimws(as.character(x))
  pattern <- "^(-?[0-9]+(\\.[0-9]+)?).*$"
  has_number <- grepl(pattern, txt)
  out <- rep(NA_real_, length(txt))
  out[has_number] <- as.numeric(sub(pattern, "\\1", txt[has_number]))
  out
}
y_birds$NDB.Std.Pts <- parse_points(y_birds$NDB.Std.Pts)

# create a binary value to indicate high scoring bird (80+ points)
# The two labels are "FALSE" and "high_score"; the level order is fixed
# explicitly so it does not depend on which values occur in the data.
# NOTE: rendered output further down that depends on the points (model error
# rates, confusion matrix, correlations, cluster modes) was produced before
# this parsing fix and will change on re-run.
y_birds$highscore <- factor(
  ifelse(y_birds$NDB.Std.Pts >= 80, "high_score", "FALSE"),
  levels = c("FALSE", "high_score")
)

# recode colors to limit number of categorical predictors (there are way too many)
nlevels(y_birds$COLOR)

## [1] 76

# The recode spec below uses car's string syntax. The call is namespace-
# qualified because dplyr is attached after car and current dplyr releases
# export their own recode(), which would otherwise mask car's and reject
# this spec.
# Every code not listed -- including the blank and "____" entries visible in
# the original levels -- falls through to 'complex'.
y_birds$COLOR <- car::recode(y_birds$COLOR, "c('B', 'BB', 'BBAR', 'BBPD', 'BBSP', 'BBTC', 'BBWF')='black_bar'; c('BC', 'BCSP', 'BCPD', 'BCWF')='blue_check'; c('DC', 'DCSP', 'DCPD', 'DCWF')='dark_check'; c('RB', 'RBSP', 'RBPD', 'RBWF')='red_bar'; c('RC', 'RCSP', 'RCPD', 'RCWF')='red_check'; c('RED', 'RSPL')='red'; c('BSPL', 'DBB')='blue_var'; c('BLK')='black'; c('WHT')='white'; c('GRIZ')= 'grizzled'; c('IND')='indigo'; c('LAV')='lavender'; c('AND')='andalusian'; c('SLT')='slate'; c('BRWN', 'CHOC', 'BRNZ')='brown_var'; c('TIC')= 'ticked'; c('PEN', 'YLW')= 'yellow_var'; else='complex'")

# Check to make sure recoding was effective
levels(y_birds$COLOR)

##  [1] "black"      "black_bar"  "blue_check" "complex"    "dark_check"
##  [6] "grizzled"   "red"        "red_bar"    "red_check"  "slate"     
## [11] "white"      "yellow_var"

nlevels(y_birds$COLOR)

## [1] 12

# Random forest classification, following the guidance in:
# http://cogns.northwestern.edu/cbmg/LiawAndWiener2002.pdf
# Fix the RNG state so the forest can be reproduced
set.seed(415)

# Only four predictors are used, to keep from overloading memory.
# do.trace = 100 prints the running OOB error every 100 trees.
m <- randomForest(
  highscore ~ COLOR + SEX + ORGANIZATION + YEAR,
  data = y_birds,
  mtry = 2,
  importance = TRUE,
  do.trace = 100
)

## ntree      OOB      1      2
##   100:  19.20%  1.69% 98.21%
##   200:  18.71%  1.39% 96.86%
##   300:  18.88%  1.49% 97.31%
##   400:  18.96%  1.59% 97.31%
##   500:  18.96%  1.49% 97.76%

# The confusion matrix shows the model is very bad at predicting high-scoring
# birds: almost every bird is assigned to the non-high-score class. That class
# is therefore classified well, but the overall OOB error is no better than
# always predicting the majority class (compare the class totals in the matrix).
print(m)

## 
## Call:
##  randomForest(formula = highscore ~ COLOR + SEX + ORGANIZATION +      YEAR, data = y_birds, mtry = 2, importance = TRUE, do.trace = 100) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 18.96%
## Confusion matrix:
##            FALSE high_score class.error
## FALSE        991         15  0.01491054
## high_score   218          5  0.97757848

# 10 fold cross-validation, repeated ten times: each pass stores one
# errorest() misclassification estimate for the same random-forest model
set.seed(131)
error.m <- numeric(10)
for (i in seq_along(error.m)) {
  error.m[i] <- errorest(
    highscore ~ COLOR + SEX + ORGANIZATION + YEAR,
    data = y_birds,
    model = randomForest,
    mtry = 2
  )$error
}

# spread of the ten error estimates
summary(error.m)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1831  0.1882  0.1908  0.1901  0.1920  0.1953

Now I will inspect the correlation coefficients between some of the variables and visualize them to get a better understanding of the relationships in the data.

keeps1 <- c("COLOR", "SEX", "YPM", "NDB.Std.Pts", "YEAR", "highscore", "ORGANIZATION")
birds2 <- y_birds[keeps1]

# Translate category labels into numeric codes with explicit lookup tables.
# The codes are looked up by label instead of recoding to a factor and calling
# as.numeric() on it: as.numeric() on a factor returns the position of each
# level (levels "1", "10", "14", ... sort as text), not the code in the label.
#
# label_to_code(): look up every label of `x` in the named vector `codes`;
# labels with no entry (including blanks and NA) receive `other`.
label_to_code <- function(x, codes, other = NA_real_) {
  out <- unname(codes[as.character(x)])
  out[is.na(out)] <- other
  out
}

color_codes <- c(
  black_bar = 1, blue_check = 2, dark_check = 3, red_bar = 4, red_check = 5,
  red = 6, blue_var = 7, black = 8, white = 9, grizzled = 10, indigo = 11,
  lavender = 12, andalusian = 13, slate = 14, brown_var = 15, ticked = 16,
  yellow_var = 17
)
sex_codes <- c(H = 1, C = 2)
score_codes <- c(high_score = 1, "FALSE" = 2)
org_codes <- c(
  "75 Combine" = 1,
  "Apple Valley RPC" = 2,
  "Dallas Homing RPC" = 3,
  "Evergreen State Concourse" = 4,
  "Wichita Friendly Pigeon Flyers" = 5
)

# Any colour group not listed above is coded 18
birds2$COLOR <- label_to_code(y_birds$COLOR, color_codes, other = 18)
# Birds with no recorded sex stay NA instead of receiving a code of their own
birds2$SEX <- label_to_code(y_birds$SEX, sex_codes)
birds2$highscore <- label_to_code(y_birds$highscore, score_codes)
birds2$ORGANIZATION <- label_to_code(y_birds$ORGANIZATION, org_codes)

# Make the remaining variables numeric
birds2$YPM <- as.numeric(birds2$YPM)
birds2$YEAR <- as.numeric(birds2$YEAR)

colnames(birds2) <- c("Color", "Sex", "Yds/Min", "Points", "Year", "HighScore", "Org.")
str(birds2)

# NOTE: the str() output below was rendered before the coding fix above;
# the Color and Sex values will differ when this chunk is re-run.
## 'data.frame':    1229 obs. of  7 variables:
##  $ Color    : num  1 1 1 1 1 5 5 9 1 5 ...
##  $ Sex      : num  3 3 2 2 2 2 2 2 2 2 ...
##  $ Yds/Min  : num  1303 1302 1287 1287 1287 ...
##  $ Points   : num  97 93 87 83 77 72 67 62 57 52 ...
##  $ Year     : num  2015 2015 2015 2015 2015 ...
##  $ HighScore: num  1 1 1 1 2 2 2 2 2 2 ...
##  $ Org.     : num  1 1 1 1 1 1 1 1 1 1 ...

colors <- c("turquoise4","seagreen3")

# pairwise.complete.obs: rows with a missing value (e.g. unrecorded sex) are
# dropped only from the pairs that involve that variable, so the matrix has no
# NA entries. With no missing values this equals the default.
# NOTE(review): the Color and Org. codes are arbitrary labels for nominal
# categories, so their correlation coefficients depend on the numbering chosen.
M <- cor(birds2, use = "pairwise.complete.obs")

# NOTE(review): newer corrplot releases expose lower.col / upper.col on
# corrplot.mixed(); confirm `col` is still accepted by the installed version.
corrplot.mixed(M, tl.pos="d", tl.col="black", tl.cex=.7, col=colors)

A final analysis can be done using k-modes clustering.

# Cluster analysis with k-modes (the categorical-data counterpart of k-means)
# http://www.inside-r.org/packages/cran/klaR/docs/kmodes
keeps2 <- c(
  "COLOR", "SEX", "YPM", "NDB.Std.Pts", "YEAR", "highscore", "ORGANIZATION"
)
birds3 <- y_birds[, keeps2]

# fix the RNG state before clustering so the result can be reproduced
set.seed(10)

# partition the birds into 2 clusters
cl <- kmodes(data = birds3, modes = 2)

# the modal value of every variable within each cluster
cl[["modes"]]

##       COLOR SEX      YPM NDB.Std.Pts YEAR highscore
## 1   complex   H 1430.453           1 2013     FALSE
## 2 black_bar   C 1261.855           1 2012     FALSE
##                ORGANIZATION
## 1 Evergreen State Concourse
## 2          Apple Valley RPC

# number of birds assigned to each cluster
cl[["size"]]

## cluster
##   1   2 
## 860 369

Pigeon_Color

Daina Bouquin

December 8, 2015