Setup
rm(list = ls())
setwd("/Users/brian/Dropbox/GitRepository/data_science_class/practicum/hw1/")
options(scipen = 999)
library("RCurl")
## Loading required package: bitops
library("stringr")
library("plyr")
library("reshape2")
library("sqldf")
## Loading required package: DBI
## Loading required package: gsubfn
## Loading required package: proto
## Loading required namespace: tcltk
## Loading Tcl/Tk interface ...
## done
## Loading required package: chron
## Loading required package: RSQLite
## Loading required package: RSQLite.extfuns
library("knitr")
1. Acquire MTA fare card data from http://mta.info/developers/fare.html
# construct urls
page <- getURL("http://mta.info/developers/fare.html")
pattern <- "data/nyct/fares/fares_[0-9]+\\.csv"
links <- unlist(str_extract_all(page, pattern))
urls <- paste0("http://mta.info/developers/", links)
# create download / cleaning function
get_mta_data <- function(url) {
# read in csv
d <- read.csv(url(url), stringsAsFactors = F)
# clean up names, which are in the second row
names(d) <- tolower(d[2, ])
names(d) <- str_trim(names(d))
names(d) <- gsub("-|/| ", "_", names(d))
d <- d[-c(1, 2), ]
# extract dates from url and reformat
pattern <- ".+/fares_([0-9]+)\\.csv"
d$date <- gsub(pattern, "\\1", url)
d$date <- as.Date(d$date, "%y%m%d")
return(d)
}
# get data
data <- ldply(urls, get_mta_data, .progress = "text")
# remove one column which is 'na' for some reason
data <- data[, -26]
# turn characters into numbers... this is weird
data <- as.data.frame(data)
data[, 3:25] <- apply(data[, 3:25], 2, as.numeric)
# fix names with numbered headers
test1 <- grep("^1_d", names(data))
names(data)[test1] <- gsub("^1_d", "day", names(data)[test1])
test7a <- grep("^7_d", names(data))
names(data)[test7a] <- gsub("^7_d", "week", names(data)[test7a])
test7b <- grep("^7d", names(data))
names(data)[test7b] <- gsub("7d", "week", names(data)[test7b])
test14 <- grep("^14_d", names(data))
names(data)[test14] <- gsub("^14_d", "twoweek", names(data)[test14])
test30 <- grep("30_d", names(data))
names(data)[test30] <- gsub("30_d", "month", names(data)[test30])
# write munged data to file
write.csv(data, "hw1data.csv", row.names = F)
2. Write a script that prints each row as a dictionary going from column name to value
# not run
for (i in 1:ncol(data)) {
key = names(data)[i]
values = data[, i]
print(paste(key, values, sep = " : "))
}
3.Create a histogram of swipes per station associated with 30-day metro cards.
What’s the outlier? What’s the mean number of 30-day swipes per station?
What’s the median? How do they relate, and does that make sense given the histogram?
# read in the data
path <- "/Users/brian/Dropbox/GitRepository/data_science_class/practicum/hw1/data/hw1data.csv"
data <- read.csv(path, stringsAsFactors = F)
# take just one week
all_dates <- unique(data$date)
samp <- sample(1:length(all_dates), 1)
the_week = data$date[samp]
one_week <- data[data$date == the_week, ]
# get into a format for plotting
q <- "SELECT station, SUM(ff) AS full_fare, SUM(week_unl) AS week_unl, SUM(month_unl) AS month_unl FROM one_week GROUP BY station"
counts <- sqldf(q)
## Loading required package: tcltk
c = "darkblue"
main = paste("Number of 30-day unlimited swipes per station, week of", the_week)
hist(counts$month_unl, breaks = 20, col = c, border = "white", xlab = "Swipes",
main = main)
main = paste("Number of 30-day unlimited swipes per station, week of", the_week)
hist(log(counts$month_unl), breaks = 20, col = c, border = "white", xlab = "log(Swipes",
main = main)
boxplot(counts$month_unl, ylab = "Swipes", bty = "n", col = c, pch = 20)
summary(counts$month_unl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5530 11900 22100 27200 248000
4. Pick several other variables that look interesting, and look at their histograms.
How similar they to the histogram for 30-day swipes?
main = paste("Number of Full-Fare swipes per station, week of", the_week)
hist(counts$full_fare, breaks = 20, col = c, border = "white", xlab = "Swipes",
main = main)
main = paste("Number of 7-day unlimited swipes per station, week of", the_week)
hist(counts$week_unl, breaks = 20, col = c, border = "white", xlab = "Swipes",
main = main)
5. Plot a scatterplot of 30-day unlimited swipes versus full fare swipes.
How do the two compare? Do they look dependent or independent?
par(bty = "n")
x <- counts$month_unl
y <- counts$week_unl
main <- paste("Week vs. Month Unlimited Swipes per Station, week of", the_week)
scatter.smooth(x, y, pch = 20, col = c, ylab = "7-day Unlimited Swipes", xlab = "30-day Unlimited Swipes",
main = main)
cor(counts$month_unl, counts$week_unl)
## [1] 0.8984
6.Acquire the same MTA fare data for all weeks available.
7. Write a program that takes all of the fare data and a column name, transforms the dates into a sortable format
data$date <- as.Date(data$date)
data[is.na(data)] <- 0
# calculate the total of all swipes by summing all numeric columns not
# sure if this is valid?
data$total <- rowSums(data[, 3:25])
data_to_map <- subset(data, select = c("station", "date", "total"))
mapped_data <- data_to_map[order(data_to_map$date, decreasing = FALSE), ]
head(mapped_data, 10)
## station date total
## 56623 WHITEHALL STREET 2010-06-12 126878
## 56624 FULTON ST & BROADWAY NASSAU 2010-06-12 37306
## 56625 CYPRESS HILLS 2010-06-12 6606
## 56626 75TH STREET & ELDERTS LANE 2010-06-12 16378
## 56627 85TH STREET & FOREST PKWAY 2010-06-12 18564
## 56628 WOODHAVEN BOULEVARD 2010-06-12 19987
## 56629 104TH STREET 2010-06-12 12449
## 56630 111TH STREET 2010-06-12 11091
## 56631 121ST STREET 2010-06-12 10482
## 56632 42ND STREET & 8TH AVENUE 2010-06-12 147187
8. Write a program which takes in the sorted result without loading it all into
memory and chunks the input by the value of the first column. In Python,
itertools.groupby does this for you, but it shouldn’t be terrible to write in other languages.
You should then be able to print out one line per station, in the following format:
_station_name,count_average,count_week_1,count_week_2,…
mapped_data$station <- as.factor(mapped_data$station)
mapped_data$date <- as.factor(mapped_data$date)
data_wide <- dcast(mapped_data, station ~ date, value.var = "total", fun.aggregate = sum)
head(data_wide, 10)[, 1:10] # only take first ten weeks for sake of interpretability
## station 2010-06-12 2010-06-19 2010-06-26
## 1 CANAL STREET 68218 68500 70978
## 2 RI TRAMWAY (MANHATTAN) 0 0 0
## 3 RI TRAMWAY (ROOSEVELT) 0 0 0
## 4 103RD ST-CENTRAL PARK WEST 27395 28789 28775
## 5 103RD ST-ROOSEVELT AVE 98495 102538 101708
## 6 103RD STREET-BROADWAY 73241 79563 79785
## 7 103RD STREET-LEXINGTON AVE 78954 86635 88993
## 8 104TH STREET 12449 13644 13719
## 9 104TH STREET-LIBERTY AVENUE 8312 9218 9217
## 10 109TH STREET-LIBERTY AVENUE 11929 13432 13409
## 2010-07-03 2010-07-10 2010-07-17 2010-07-24 2010-07-31 2010-08-07
## 1 70281 70853 66101 69575 70086 70811
## 2 0 0 0 1 0 0
## 3 0 1 0 1 0 0
## 4 28731 29490 27307 29475 29378 29018
## 5 100795 101089 97548 102176 101866 99763
## 6 73386 65785 58498 60093 58619 57240
## 7 82249 83073 75816 79972 80427 82114
## 8 11460 14094 10295 11504 11519 11449
## 9 9116 9129 8120 8848 8940 9066
## 10 13285 13259 11858 12795 13039 13004
9. For a given week, bucket the 30-day unlimited swipes and full fare swipes into two buckets each:
below the median and above the median. Count the number that go into each pair of buckets
Look at a table of these numbers. How does the table compare to the scatterplot?
What’s the probability of being in a high full fare bucket if you’re in a high 30-day unlimited swipes bucket?
med1 <- median(one_week$month_unl)
one_week$med_mth <- ifelse(one_week$month_unl >= med1, 1, 0)
med2 <- median(one_week$ff)
one_week$med_ff <- ifelse(one_week$ff >= med2, 1, 0)
tab <- table(one_week$med_ff, one_week$med_mth)
tab
##
## 0 1
## 0 201 30
## 1 30 201
high_ff_mth <- tab[2, 2]
p <- high_ff_mth/sum(tab)
p
## [1] 0.4351