Cleaning the raw data of texas listings

Cleaning the listings data for Texas

downloaded the raw files for all the listings
read all the files together and combined into one listing for texas using the ldply() function
changed the names and did some cleaning on the months and the year and discarded the date column
wrote them into the a new file

options(stringsAsFactors = FALSE)
library(plyr)

paths <- dir("data/texas-msa-sales/raw-listings", pattern = "\\.csv$", full = T)
names(paths) <- substr(basename(paths), 3, 5)
listings <- ldply(paths, read.csv, na.strings = "-")
head(listings)

##   .id     Date Sales DollarVolume AveragePrice MedianPrice TotalListings
## 1 110 1990-Jan   110      5047628        45900       36700           764
## 2 110      Feb    65      3089121        47500       47900           976
## 3 110      Mar    91      4437576        48800       43900          1036
## 4 110      Apr   101      4355732        43100       37700          1038
## 5 110      May   113      5059475        44800       36500           966
## 6 110      Jun   107      4421083        41300       32500           959
##   MonthsInventory
## 1             9.5
## 2            12.1
## 3            12.8
## 4            12.4
## 5            11.1
## 6            10.9

names(listings) <- c("msa", "date", "sales", "volume", "price_avg", "price_med", 
    "listings", "inventory")
nc <- nchar(listings$date)
listings$month <- as.numeric(factor(tolower(substr(listings$date, nc - 2, nc)), 
    levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", 
        "oct", "nov", "dec")))
listings$year <- rep(1990:2009, each = 12)[1:232]
listings$date <- NULL
write.table(listings, "texas-listings.csv", sep = ",", row = F)
head(listings)

##   msa sales  volume price_avg price_med listings inventory month year
## 1 110   110 5047628     45900     36700      764       9.5     1 1990
## 2 110    65 3089121     47500     47900      976      12.1     2 1990
## 3 110    91 4437576     48800     43900     1036      12.8     3 1990
## 4 110   101 4355732     43100     37700     1038      12.4     4 1990
## 5 110   113 5059475     44800     36500      966      11.1     5 1990
## 6 110   107 4421083     41300     32500      959      10.9     6 1990