Cleaning the raw data of texas listings

Cleaning the listings data for Texas

options(stringsAsFactors = FALSE)
library(plyr)

paths <- dir("data/texas-msa-sales/raw-listings", pattern = "\\.csv$", full = T)
names(paths) <- substr(basename(paths), 3, 5)
listings <- ldply(paths, read.csv, na.strings = "-")
head(listings)
##   .id     Date Sales DollarVolume AveragePrice MedianPrice TotalListings
## 1 110 1990-Jan   110      5047628        45900       36700           764
## 2 110      Feb    65      3089121        47500       47900           976
## 3 110      Mar    91      4437576        48800       43900          1036
## 4 110      Apr   101      4355732        43100       37700          1038
## 5 110      May   113      5059475        44800       36500           966
## 6 110      Jun   107      4421083        41300       32500           959
##   MonthsInventory
## 1             9.5
## 2            12.1
## 3            12.8
## 4            12.4
## 5            11.1
## 6            10.9
names(listings) <- c("msa", "date", "sales", "volume", "price_avg", "price_med", 
    "listings", "inventory")
nc <- nchar(listings$date)
listings$month <- as.numeric(factor(tolower(substr(listings$date, nc - 2, nc)), 
    levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", 
        "oct", "nov", "dec")))
listings$year <- rep(1990:2009, each = 12)[1:232]
listings$date <- NULL
write.table(listings, "texas-listings.csv", sep = ",", row = F)
head(listings)
##   msa sales  volume price_avg price_med listings inventory month year
## 1 110   110 5047628     45900     36700      764       9.5     1 1990
## 2 110    65 3089121     47500     47900      976      12.1     2 1990
## 3 110    91 4437576     48800     43900     1036      12.8     3 1990
## 4 110   101 4355732     43100     37700     1038      12.4     4 1990
## 5 110   113 5059475     44800     36500      966      11.1     5 1990
## 6 110   107 4421083     41300     32500      959      10.9     6 1990