Cleaning the raw data of texas listings
options(stringsAsFactors = FALSE)
library(plyr)
paths <- dir("data/texas-msa-sales/raw-listings", pattern = "\\.csv$", full = T)
names(paths) <- substr(basename(paths), 3, 5)
listings <- ldply(paths, read.csv, na.strings = "-")
head(listings)
## .id Date Sales DollarVolume AveragePrice MedianPrice TotalListings
## 1 110 1990-Jan 110 5047628 45900 36700 764
## 2 110 Feb 65 3089121 47500 47900 976
## 3 110 Mar 91 4437576 48800 43900 1036
## 4 110 Apr 101 4355732 43100 37700 1038
## 5 110 May 113 5059475 44800 36500 966
## 6 110 Jun 107 4421083 41300 32500 959
## MonthsInventory
## 1 9.5
## 2 12.1
## 3 12.8
## 4 12.4
## 5 11.1
## 6 10.9
names(listings) <- c("msa", "date", "sales", "volume", "price_avg", "price_med",
"listings", "inventory")
nc <- nchar(listings$date)
listings$month <- as.numeric(factor(tolower(substr(listings$date, nc - 2, nc)),
levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep",
"oct", "nov", "dec")))
listings$year <- rep(1990:2009, each = 12)[1:232]
listings$date <- NULL
write.table(listings, "texas-listings.csv", sep = ",", row = F)
head(listings)
## msa sales volume price_avg price_med listings inventory month year
## 1 110 110 5047628 45900 36700 764 9.5 1 1990
## 2 110 65 3089121 47500 47900 976 12.1 2 1990
## 3 110 91 4437576 48800 43900 1036 12.8 3 1990
## 4 110 101 4355732 43100 37700 1038 12.4 4 1990
## 5 110 113 5059475 44800 36500 966 11.1 5 1990
## 6 110 107 4421083 41300 32500 959 10.9 6 1990