Introduction

For my horse system, I use R data.frames to store the horse data I train the model on. This works because my dataset is relatively small: around 60,000 horses with around 50 features. As the dataset grows, it will become advantageous to use a database instead of the in-memory storage native to R.

The following is an example of how I could load a table with the same data that I currently store in a data.frame, and how I could then access that data again through SQL queries. I will process text files in R, then send them to the database. The key difference between the method presented here and my previous approach is that this one only ever holds one day’s worth of information in memory at a time, whereas the old method loaded years’ worth of information all at once.

Preparations

Before I begin the data transfer, I need to make some preparations. I will have R list all of the files in my directory that need to be loaded; the files I am interested in have the extensions DRF, DR2, DR3, and DR4. I also need to establish a connection to the database I am working with, and of course I will need the related libraries.

library(plyr)
library(dplyr)
library(stringr)
library(RPostgreSQL)
library(knitr)

DRF_files = list.files(pattern = '\\.DRF$') #escape the dot so it matches a literal period
DR2_files = list.files(pattern = '\\.DR2$')
DR3_files = list.files(pattern = '\\.DR3$')
DR4_files = list.files(pattern = '\\.DR4$')


#Connect to the database
#The database and user already exist; R just supplies the credentials to connect
pw = 'myPass'
drv = dbDriver("PostgreSQL")
con = dbConnect(drv, dbname = "postgres",
                 host = "localhost", port = 5432,
                 user = "openpg", password = pw)
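
As a quick sanity check (an addition here, not part of my original script), dbListTables confirms the connection is live before any data moves:

dbListTables(con) #should return the tables visible to this user, without error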

Transferring Data

Now that we have a list of files that we’d like to transfer, and a connection to a database, we are ready to proceed. Note that I created the database table called ‘horse_level_combined’ ahead of time, so I am writing to that existing table rather than creating a new one. I will write a for loop that processes each file, creates new calculated fields, combines all 4 files into 1 data.frame, then writes that data to the database.
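
For reference, if the table did not already exist, one option (a sketch of an alternative, not what I actually ran) is to let dbWriteTable create it from the first processed data.frame and append from there:

#Hypothetical sketch: h_first stands in for one day's processed data.frame,
#shaped like the `h` built inside the loop below
if(!dbExistsTable(con, 'horse_level_combined')) {
  dbWriteTable(con, 'horse_level_combined', value = h_first, row.names = F)
}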

#Clear out any existing rows so the transfer starts clean
dbGetQuery(con, 'delete from horse_level_combined')
## NULL
#Loop over each day's files; build the combined data.frame and append it to the table
for(a in seq_along(DRF_files)) {
  #process each file
  drf = read.csv(DRF_files[a], header = F, stringsAsFactors = F)
  RaceData = drf[,c(1,2,3,4,5,7, 23)]
  names(RaceData) = c("Track", "RDate", "RNum", "Dist","Surface","RType", "PostTime")
  RaceData$Surface = substr(as.character(RaceData$Surface), 1,1) #Fix import quirk: surface code "T" gets read as logical TRUE
  
  dr2 = read.csv(DR2_files[a], header = F)
  #Second read with quoting disabled so the program-number field survives intact
  dr2_ = read.csv(DR2_files[a], header = F, quote = "")
  HorseData = dr2[,c(1,2,3,4,19,21,24,25,41,43,64,65,69,71,75,189,201,202)]
  names(HorseData) = c("Track", "RDate", "RNum", "PP", "TrainRecordCurYear", "JockRecordCurYear", "HName", "YOB", "Medication", 
                       "EquipChange", "Year", "StartsYear", "EarnYear", "StartsPrev", "EarnPrev", "BRISRunstyle", "ProgramNum", "MLOdds")
  #Strip punctuation (stray quote characters) from the program number
  HorseData$ProgramNum = str_replace_all(dr2_[,201], "[:punct:]", "")
  
  dr3 = read.csv(DR3_files[a], header = F)
  PastPerf = dr3[,c(1,2,3,4,5,14,35,41,56,57,61,64,84)]
  names(PastPerf) = c("Track", "RDate", "RNum", "PP", "PrevRDate", "NumHor", "Purse", "FinishPos", "BRIS2fPace", "BRIS4fPace",
                      "BRISLatePace", "BRISSpeed", "ClaimedFlag")
  
  dr4 = read.csv(DR4_files[a], header = F)
  BRISStats = dr4[,c(1,2,3,4,5,10)]
  names(BRISStats) = c("Track", "RDate", "RNum", "PP", "PowerRating", "BestSpeedLife")
  
  #build features and combine the data
  RaceData$PostTime = gsub("[(),/ ]", "", substr(RaceData$PostTime, 1, 7)) #strip parentheses, commas, slashes, and spaces
  
  #Trainer and jockey win percentages
  HorseData$TrainStartCurYear = as.numeric(substr(HorseData$TrainRecordCurYear, 1, 4))
  HorseData$TrainWinCurYear = as.numeric(substr(HorseData$TrainRecordCurYear, 5, 8))
  HorseData$TrainWinPct = HorseData$TrainWinCurYear/(HorseData$TrainStartCurYear + 1) # +1 in the denominator avoids dividing by zero
  HorseData$JocStartCurYear = as.numeric(substr(HorseData$JockRecordCurYear, 1, 4))
  HorseData$JocWinCurYear = as.numeric(substr(HorseData$JockRecordCurYear, 5, 8))
  HorseData$JockWinPct = HorseData$JocWinCurYear/(HorseData$JocStartCurYear + 1)
  
  # Equipment changes
  HorseData$addBlinks = ifelse(HorseData$EquipChange == 1, 1, 0)
  HorseData$remBlinks = ifelse(HorseData$EquipChange == 2, 1, 0)
  
  # Horse earnings
  HorseData$earnPer = (HorseData$EarnYear + HorseData$EarnPrev)/(HorseData$StartsYear+HorseData$StartsPrev)
  HorseData$logEarn = log(HorseData$earnPer + 1)
  
  # Age
  HorseData$birthYear = 2000+HorseData$YOB #YOB is stored as a 2-digit year
  HorseData$age = as.numeric(HorseData$Year) - HorseData$birthYear
  
  # Find days since previous race
  PastPerf$RDateD = as.Date(as.character(PastPerf$RDate), "%Y%m%d")
  PastPerf$PrevDateD = as.Date(as.character(PastPerf$PrevRDate), "%Y%m%d")
  PastPerf$DaysOld = PastPerf$RDateD - PastPerf$PrevDateD
  
  # Finish position is messy, including records for horses that didn't finish.  
  # We will say in these cases that they simply finished last.
  PastPerf$FinishPos = ifelse(is.na(as.numeric(as.character(PastPerf$FinishPos))), 
                              PastPerf$NumHor, 
                              as.numeric(as.character(PastPerf$FinishPos)))
  PastPerf$FinishPos = ifelse(PastPerf$FinishPos %in% c(89,99), PastPerf$NumHor, PastPerf$FinishPos)
  
  # subset just races in the past 120 days for recency
  pastPerf120 = filter(PastPerf, DaysOld <= 120)
  pastPerf120 = group_by(pastPerf120, Track, RDate, RNum, PP)
  pastPerf120 = summarize(pastPerf120, countRaces120 = length(Track))
  
  # Subset just races in past 365 days (note: the avg*120 field names below
  # are computed over this 365-day window; the names match the later code)
  pastPerf365 = filter(PastPerf, DaysOld <= 365, !is.na(BRISSpeed))
  pastPerf365 = group_by(pastPerf365, Track, RDate, RNum, PP)
  pastPerf365 = summarize(pastPerf365, avgSpeed120 = mean(BRISSpeed), avg2FSpeed120 = mean(BRIS2fPace), 
                          avg4FSpeed120 = mean(BRIS4fPace), avgLateSpeed120 = mean(BRISLatePace))
  
  
  # subset just races in past 500 days
  pastPerf500 = filter(PastPerf, DaysOld <= 500)
  pastPerf500 = group_by(pastPerf500, Track, RDate, RNum, PP)
  pastPerf500 = summarize(pastPerf500, logPurse = log(mean(Purse)))
  
  #Clean up, take only what we want
  HorseData = select(HorseData, Track, RDate, RNum, PP, HName, BRISRunstyle, TrainWinPct, JockWinPct, addBlinks, remBlinks, logEarn, age, MLOdds,
                     ProgramNum)
  PastPerf = left_join(pastPerf500, pastPerf365)
  PastPerf = left_join(PastPerf, pastPerf120)
  PastPerf[is.na(PastPerf)] = 0
  PastPerf$countsq = PastPerf$countRaces120*PastPerf$countRaces120
  
  #Combine the sets 
  h = inner_join(HorseData, BRISStats)
  h = inner_join(h, RaceData)
  h = left_join(h, PastPerf)
  h = unique(h)
  h[is.na(h)] = 0
  h$Track = str_trim(h$Track)
  
  #Build features that compare each horse to others in the same race
  i = group_by(h, Track, RDate, RNum)
  j = summarise(i, meanEarn = mean(logEarn), meanPurse = mean(logPurse), 
                meanSpeed120 = mean(avgSpeed120), meanES = mean(avg2FSpeed120), mean4f = mean(avg4FSpeed120), 
                meanLS = mean(avgLateSpeed120), numHor = length(PP), meanPower = mean(PowerRating))
  h = inner_join(i, j)
  
  h$compEarn = ifelse(h$logEarn == 0, 0, h$logEarn - h$meanEarn) #earnings compared to competition
  h$compPurse = ifelse(h$logPurse == 0, 0, h$logPurse- h$meanPurse) #average purse compared to competition
  h$compSpeed120 = ifelse(h$avgSpeed120 == 0, 0, h$avgSpeed120 - h$meanSpeed120) #speed compared to competition
  h$compES = ifelse(h$avg2FSpeed120 == 0, 0, h$avg2FSpeed120 - h$meanES) #early speed compared to competition
  h$comp4f = ifelse(h$avg4FSpeed120 == 0, 0, h$avg4FSpeed120 - h$mean4f) #middle speed compared to competition
  h$compLS = ifelse(h$avgLateSpeed120 == 0, 0, h$avgLateSpeed120 - h$meanLS) #late speed compared to competition
  h$compPower = ifelse(h$PowerRating == 0, 0, h$PowerRating - h$meanPower) #BRIS Power rating compared to competition
  h$numHorsq = h$numHor*h$numHor
  h$agesq = h$age * h$age
  h$raceID = factor(paste(h$Track, h$RDate, h$RNum, sep = ""))
  
  h = as.data.frame(h)
  
  dbWriteTable(con, 'horse_level_combined', value = h, append = T, row.names = F)
}
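
With the loop finished, a quick count (again, an addition for illustration) verifies that rows actually landed in the table:

#Total number of records loaded across all files
dbGetQuery(con, 'select count(*) from horse_level_combined')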

Accessing the Data

Great, so now I’ve processed all of my files and the database holds all of my data. From here, I need queries to get at it.

Let’s say I want to do some exploratory analysis and see how many races were run at each track. A query finds that information:

races_per_track = dbGetQuery(con, 'select "Track" , Count(distinct "raceID") as Count from "horse_level_combined" group by "Track" order by "Track" asc;')

kable(races_per_track)
Track    count
------  ------
AJX         18
BEL         56
CON         54
CT          32
DED         55
EVD         45
FL          53
GG          33
GP          12
GPW         58
GRM          9
HAW         41
HST         18
IND         56
KEE         58
LA          26
LRL         23
LS          18
MED         12
NP          28
PEN         40
PM          30
PRX         44
RP          36
SA          44
TDN         34
TUP         33
WO          44
WRD         36
ZIA         55
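
Because dbGetQuery returns an ordinary data.frame, the result plugs straight into the usual R tools. For example, to sort the busiest tracks to the top:

#Sort the query result in R rather than in SQL
head(arrange(races_per_track, desc(count)), 5)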

Maybe I’m interested in a specific race, say the 5th race at Belmont Park on October 24th, 2015, and I want to see the horses ordered by their BRIS Power Rating. Note that I have a raceID field which I could use for this query, but to keep things simple and easy to understand, I will use a combination of variables instead.

Belmont5 = dbGetQuery(con, 'select "Track", "RNum", "RDate", "HName", "PowerRating" from "horse_level_combined" where "Track" = \'BEL\' and "RNum" = 5 and "RDate" = \'20151024\' order by "PowerRating" desc;')

kable(Belmont5)
Track    RNum  RDate     HName                PowerRating
------  -----  --------  -------------------  -----------
BEL         5  20151024  WARRIOROFTHEROSES         134.41
BEL         5  20151024  ROYAL POSSE               131.92
BEL         5  20151024  BEYOND EMPIRE             131.72
BEL         5  20151024  SIOUX                     131.52
BEL         5  20151024  SARATOGA SNACKS           131.51
BEL         5  20151024  GOOD LUCK GUS             130.84
BEL         5  20151024  EMPIRE DREAMS             129.64
BEL         5  20151024  JOHANNESBURG SMILE        124.34
BEL         5  20151024  FULL OF MINE              117.66
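
For comparison, the raceID field built during the transfer could pull the same race, since it is just Track, RDate, and RNum pasted together (here ‘BEL’ + ‘20151024’ + ‘5’):

Belmont5_id = dbGetQuery(con, 'select "HName", "PowerRating" from "horse_level_combined" where "raceID" = \'BEL201510245\' order by "PowerRating" desc;')

And since the session is done after these queries, the connection and driver can be released (a housekeeping step worth showing):

dbDisconnect(con)
dbUnloadDriver(drv)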

Conclusion

I populated my database with data from text files, then queried some of that information to demonstrate how you might use the RPostgreSQL library. This was just a simple exercise, but if my data were 1,000,000 observations instead of tens of thousands, holding it all in memory would risk crashing R every time it ran a computation. If you’re doing any projects at an industrial scale, it is important to be comfortable using databases with the analysis software of your choice.