# Tidy work
# Load the required packages
library(tibble)   # as_tibble()
library(ggplot2)  # plotting
library(ggmap)    # get_map(), ggmap()
library(lsa)      # cosine()
# Load the data
uber_apr <- read.csv("uber-raw-data-apr14.csv", header = TRUE)
uber_may <- read.csv("uber-raw-data-may14.csv", header = TRUE)
uber_jun <- read.csv("uber-raw-data-jun14.csv", header = TRUE)
uber_jul <- read.csv("uber-raw-data-jul14.csv", header = TRUE)
uber_aug <- read.csv("uber-raw-data-aug14.csv", header = TRUE)
uber_sep <- read.csv("uber-raw-data-sep14.csv", header = TRUE)
# Combine the data of different months into one
uber_raw_data <- as_tibble(rbind(uber_apr, uber_may, uber_jun, uber_jul, uber_aug, uber_sep))
#summary(uber_raw_data)
#head(uber_raw_data)
colnames(uber_raw_data)
## [1] "Date.Time" "Lat" "Lon" "Base"
We are going to explore the Uber pick-ups on a daily basis, which requires some tidying work on our data `uber_raw_data`.
uber_raw_data$Weekday <- weekdays(as.Date(uber_raw_data$Date.Time,format="%m/%d/%Y"))
summary(uber_raw_data$Weekday)
## Length Class Mode
## 4534327 character character
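Note that `summary()` on a character vector reports only its length and class. For a quick sanity check of the per-weekday counts (the same numbers the aggregation below produces), `table()` works directly:
# Quick check: pick-up counts per weekday
table(uber_raw_data$Weekday)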
Now we calculate the basic statistics.
# Calculate the pickups in each weekday
pickup_daily <- aggregate(uber_raw_data$Weekday, FUN = length, by = list(uber_raw_data$Weekday))
# Re-name the columns
colnames(pickup_daily) <- c("Weekday", "pick_ups")
# obtain the max and min pickups
summary(pickup_daily)
## Weekday pick_ups
## Length:7 Min. :490180
## Class :character 1st Qu.:593793
## Mode :character Median :663789
## Mean :647761
## 3rd Qu.:718814
## Max. :755145
From the summary above, we see that the largest number of Uber pick-ups is 755145 (on Thursday), and the minimum is 490180 (on Sunday).
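Rather than reading the extremes off the summary, we can extract them directly from `pickup_daily`:
# Weekdays with the most and the fewest pick-ups
pickup_daily$Weekday[which.max(pickup_daily$pick_ups)]  # Thursday, per the summary above
pickup_daily$Weekday[which.min(pickup_daily$pick_ups)]  # Sunday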
# Re-order the weekdays chronologically (as.factor alone would sort them alphabetically)
pickup_daily$Weekday <- factor(pickup_daily$Weekday,
    levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
# Plot the pickup amount by weekdays in barchart
ggplt <- ggplot(pickup_daily, aes(Weekday, pick_ups, fill = pick_ups)) +
geom_bar(stat = "identity", position="dodge") +
# Add the title
labs(title = "Uber Pick Up Break Down By Weekday")
ggplt
# Add a month column
uber_apr$month <- "Apr"
#uber_apr$month <- as.factor(uber_apr$month)
#head(uber_apr)
uber_may$month <- "May"
#uber_may$month <- as.factor(uber_may$month)
uber_jun$month <- "Jun"
#uber_jun$month <- as.factor(uber_jun$month)
uber_jul$month <- "Jul"
#uber_jul$month <- as.factor(uber_jul$month)
uber_aug$month <- "Aug"
#uber_aug$month <- as.factor(uber_aug$month)
uber_sep$month <- "Sep"
#uber_sep$month <- as.factor(uber_sep$month)
# Update the dataset
# NOTE: copying a month label into each data frame by hand is repetitive and
# error-prone; a tidier alternative is sketched after this chunk.
uber_raw_data <- rbind(uber_apr,uber_may,uber_jun,uber_jul,uber_aug,uber_sep)
#head(uber_raw_data)
#colnames(uber_raw_data)
#summary(uber_raw_data)
uber_raw_data$Weekday <- weekdays(as.Date(uber_raw_data$Date.Time,format="%m/%d/%Y"))
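As noted above, hand-labelling six data frames invites copy-paste mistakes. A tidier alternative (a sketch assuming the `dplyr` package is available, applied to the data frames as originally loaded, before the manual `month` columns were added; `uber_raw_data2` is a hypothetical name) tags and combines them in one step:
# Sketch of a tidier month-tagging approach with dplyr::bind_rows()
library(dplyr)
uber_list <- list(Apr = uber_apr, May = uber_may, Jun = uber_jun,
                  Jul = uber_jul, Aug = uber_aug, Sep = uber_sep)
# .id = "month" creates the month column from the list names
uber_raw_data2 <- bind_rows(uber_list, .id = "month")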
# Calculate the pickups of each month
pickup_monthly <- aggregate(uber_raw_data$month, FUN = length, by = list(uber_raw_data$month))
# Re-name columns
colnames(pickup_monthly) <- c("month", "pick_ups")
# Order the months chronologically (as.factor alone would sort them alphabetically)
pickup_monthly$month <- factor(pickup_monthly$month,
    levels = c("Apr", "May", "Jun", "Jul", "Aug", "Sep"))
#summary(pickup_monthly)
# Plot it in barchart
ggplt_month <- ggplot(pickup_monthly, aes(x=month, y=pick_ups, fill = pick_ups)) +
# barchart
geom_bar(stat = "identity", position="dodge") +
# Add a title
labs(title = "Uber Pick-Up Trends over the Months")
ggplt_month
With the months in chronological order, the Uber pick-up counts rise month over month from April to September, reaching their highest level in September.
The basic steps for finding the hidden hourly pattern are the same as before.
# Data manipulation again
uber_raw_data$time <- format(strptime(uber_raw_data$Date.Time, format = "%m/%d/%Y %H:%M:%S"), "%H")
# Group the data each hour and calculate the pick-up amount using the function length
pickup_hourly <- aggregate(uber_raw_data$time, FUN = length, by = list(uber_raw_data$time))
# Assign the column names
colnames(pickup_hourly) <- c("hour", "pick_ups")
# Plotting the barchart
ggplt_hourly <- ggplot(pickup_hourly, aes(x=hour, y=pick_ups, fill = pick_ups)) +
# Add the bar
geom_bar(stat = "identity", position="dodge") +
# Add the title
labs(title = "Uber Pick-up Trends in Hours")
# Need to adjust the coloring of the plot `ggplt_hourly`
ggplt_hourly
colnames(uber_raw_data)
## [1] "Date.Time" "Lat" "Lon" "Base" "month" "Weekday"
## [7] "time"
Overall, Uber pick-ups increase through the morning, peak at 17:00 each day (the evening rush, which coincides with our intuition), and then decrease.
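We can confirm the peak hour programmatically from the `pickup_hourly` table built above:
# Hour with the most pick-ups (17, per the plot above)
pickup_hourly$hour[which.max(pickup_hourly$pick_ups)]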
# Task 2.1
# We plot the data as a density plot with 7 facets so that we can observe the variation among the weekdays.
# We plot the density on top of a NYC map, where the x axis is longitude and the y axis is latitude. Density levels are presented in color.
# One solution is to use the package `ggmap`; another elegant approach is `leaflet` (see the sketch after the first map plot below).
# Select the data needed for this task
#colnames(uber_raw_data)
#NYC_day_pickups <- subset(uber_raw_data, select = c(Lat, Lon, Weekday))
#
# reorder the data according to the weekdays
#NYC_day_pickups$Weekday<- factor(NYC_day_pickups$Weekday, levels=c("Monday",
# "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday","Sunday"))
#NYC_day_pickups<-NYC_day_pickups[order(NYC_day_pickups$Weekday),]
# Get the map of NYC
NYC_map <- get_map(location = "NYC", zoom = 12, maptype = "roadmap")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=NYC&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NYC&sensor=false
# Save the base map
NYC <- ggmap(NYC_map)
# Plot the pickup data as point plot on top of the NYC map.
NYC + stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
        size = 8, bins = 30, alpha = 0.5, geom = "polygon", data = uber_raw_data) +
    scale_fill_gradient(low = "blue", high = "red") +
    facet_wrap(~ Weekday) +
    labs(x = "Longitude", y = "Latitude",
         title = "Uber Pickup Density Graphic-Break Down By Weekday",
         subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).
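As mentioned earlier, `leaflet` is an elegant interactive alternative to the static `ggmap` approach. A minimal sketch, assuming the `leaflet` package is installed; the full data set is far too large to render interactively, so we plot a random sample:
# Interactive alternative with leaflet (sketch)
library(leaflet)
set.seed(1)
uber_sample <- uber_raw_data[sample(nrow(uber_raw_data), 10000), ]
leaflet(uber_sample) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~Lon, lat = ~Lat, radius = 1,
                   stroke = FALSE, fillOpacity = 0.3)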
# Task 2.2
# Similar to Task 2.1.
# focus on the data needed
#NYC_monthly_pickups <- subset(uber_raw_data, select = c(Lat, Lon, month))
#NYC_monthly_pickups$month<- factor(NYC_monthly_pickups$month, levels = c("April", "May", "June", "July","August","September"))
#NYC_monthly_pickups<-NYC_monthly_pickups[order(NYC_monthly_pickups$month),]
NYC + stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
        size = 8, bins = 30, alpha = 0.5, geom = "polygon", data = uber_raw_data) +
    scale_fill_gradient(low = "blue", high = "red") +
    facet_wrap(~ month) +
    labs(x = "Longitude", y = "Latitude",
         title = "Uber Pickup Density Graphic-Break Down By month",
         subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).
# Task 2.3
# Similar treatment as Tasks 2.1 and 2.2.
# select the data needed for this task
# Why is the commented-out approach below problematic? The column is named `time`,
# not `Time`, and `Date.Time` is not in "%H:%M:%S" format, so the strptime call
# would return NA. The `time` column built earlier already holds the hour.
#hourly_pickup <- subset(uber_raw_data, select = c(time, Lat, Lon))
#Change the time format to simply showing the hour
#Hour <- format(as.POSIXct(strptime(hourly_pickup$Time, "%H:%M:%S", tz="")), format="%H")
#hourly_pickup$time <- Hour
NYC + stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
        size = 8, bins = 30, alpha = 0.5, geom = "polygon", data = uber_raw_data) +
    scale_fill_gradient(low = "blue", high = "red") +
    facet_wrap(~ time) +
    labs(x = "Longitude", y = "Latitude",
         title = "Uber Pickup Density Graphic-Break Down over hours",
         subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).
The first question (Task 3.1) is very straightforward.
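Recall that the cosine similarity of two vectors is their dot product divided by the product of their Euclidean norms; this is what `lsa::cosine()` computes below. A hand-rolled equivalent for reference (the name `cosine_manual` is ours):
# Cosine similarity: dot product over the product of the Euclidean norms
cosine_manual <- function(x, y) sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))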
# Task 3.1
data(mtcars)
# Keep mtcars as a data frame: the loop below indexes rows by name,
# and converting to a tibble would drop the row names
#mtcars <- as_tibble(mtcars)
#summary(mtcars)
#head(mtcars)
typeof(mtcars)
## [1] "list"
# Number of automobiles
Num <- dim(mtcars)[1]
#colnames(mtcars)
#rownames(mtcars)
# Initialization
A <- matrix(0, Num, Num)
# Rename rows and cols
rownames(A) <- rownames(mtcars)
colnames(A) <- rownames(mtcars)
#A["Hornet 4 Drive","Valiant"]
# Assign the cosine.similarity value to the matrix.
for (x_name in rownames(mtcars)) {
for (y_name in rownames(mtcars)){
x <- as.numeric(mtcars[x_name,])
y <- as.numeric(mtcars[y_name,])
A[x_name,y_name] <- lsa::cosine(x,y)
}
}
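The double loop above makes 32 × 32 calls to `lsa::cosine()`. An equivalent vectorized computation (a sketch; `A2` is a hypothetical name) normalizes each row once and takes a single matrix product:
# Vectorized equivalent: normalize each row, then one matrix product
M <- as.matrix(mtcars)
M_norm <- M / sqrt(rowSums(M^2))   # each row divided by its Euclidean norm
A2 <- M_norm %*% t(M_norm)         # should match A up to floating-point error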
summary(A)
## Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
## Min. :0.9591 Min. :0.9594 Min. :0.9416 Min. :0.8867
## 1st Qu.:0.9787 1st Qu.:0.9786 1st Qu.:0.9591 1st Qu.:0.9477
## Median :0.9876 Median :0.9875 Median :0.9822 Median :0.9790
## Mean :0.9859 Mean :0.9859 Mean :0.9774 Mean :0.9674
## 3rd Qu.:0.9967 3rd Qu.:0.9968 3rd Qu.:0.9923 3rd Qu.:0.9959
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Hornet Sportabout Valiant Duster 360 Merc 240D
## Min. :0.9012 Min. :0.9010 Min. :0.9318 Min. :0.8867
## 1st Qu.:0.9567 1st Qu.:0.9577 1st Qu.:0.9776 1st Qu.:0.9583
## Median :0.9858 Median :0.9850 Median :0.9869 Median :0.9738
## Mean :0.9731 Mean :0.9731 Mean :0.9818 Mean :0.9662
## 3rd Qu.:0.9982 3rd Qu.:0.9970 3rd Qu.:0.9958 3rd Qu.:0.9866
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Merc 230 Merc 280 Merc 280C Merc 450SE
## Min. :0.9572 Min. :0.9593 Min. :0.9580 Min. :0.9366
## 1st Qu.:0.9791 1st Qu.:0.9735 1st Qu.:0.9735 1st Qu.:0.9809
## Median :0.9854 Median :0.9840 Median :0.9842 Median :0.9894
## Mean :0.9846 Mean :0.9851 Mean :0.9850 Mean :0.9835
## 3rd Qu.:0.9931 3rd Qu.:0.9968 3rd Qu.:0.9969 3rd Qu.:0.9971
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
## Min. :0.9375 Min. :0.9359 Min. :0.8783 Min. :0.8899
## 1st Qu.:0.9812 1st Qu.:0.9806 1st Qu.:0.9411 1st Qu.:0.9489
## Median :0.9894 Median :0.9895 Median :0.9762 Median :0.9811
## Mean :0.9837 Mean :0.9834 Mean :0.9640 Mean :0.9686
## 3rd Qu.:0.9972 3rd Qu.:0.9970 3rd Qu.:0.9972 3rd Qu.:0.9973
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
## Min. :0.9088 Min. :0.9101 Min. :0.9203 Min. :0.8870
## 1st Qu.:0.9611 1st Qu.:0.9351 1st Qu.:0.9422 1st Qu.:0.9151
## Median :0.9874 Median :0.9529 Median :0.9539 Median :0.9375
## Mean :0.9752 Mean :0.9579 Mean :0.9585 Mean :0.9451
## 3rd Qu.:0.9957 3rd Qu.:0.9800 3rd Qu.:0.9738 3rd Qu.:0.9758
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
## Min. :0.9530 Min. :0.8965 Min. :0.9039 Min. :0.9329
## 1st Qu.:0.9686 1st Qu.:0.9539 1st Qu.:0.9588 1st Qu.:0.9763
## Median :0.9857 Median :0.9841 Median :0.9870 Median :0.9860
## Mean :0.9813 Mean :0.9715 Mean :0.9742 Mean :0.9815
## 3rd Qu.:0.9956 3rd Qu.:0.9986 3rd Qu.:0.9979 3rd Qu.:0.9958
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
## Min. :0.8839 Min. :0.9223 Min. :0.9578 Min. :0.8783
## 1st Qu.:0.9451 1st Qu.:0.9452 1st Qu.:0.9709 1st Qu.:0.9032
## Median :0.9787 Median :0.9634 Median :0.9863 Median :0.9576
## Mean :0.9665 Mean :0.9660 Mean :0.9824 Mean :0.9453
## 3rd Qu.:0.9977 3rd Qu.:0.9869 3rd Qu.:0.9957 3rd Qu.:0.9791
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
## Min. :0.9376 Min. :0.8867 Min. :0.8958 Min. :0.9395
## 1st Qu.:0.9720 1st Qu.:0.9118 1st Qu.:0.9252 1st Qu.:0.9568
## Median :0.9840 Median :0.9560 Median :0.9660 Median :0.9830
## Mean :0.9807 Mean :0.9467 Mean :0.9529 Mean :0.9762
## 3rd Qu.:0.9954 3rd Qu.:0.9701 3rd Qu.:0.9758 3rd Qu.:0.9905
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
To find the most similar pair of automobiles, we just need to pick the pair with the highest cosine similarity.
# Task 3.2
# Exclude each automobile's comparison with itself by zeroing out the diagonal
A <- A - diag(Num)
# Find the position of the maximal similarity
pos <- which(A == max(A), arr.ind = TRUE)
# The most similar automobile
pos
## row col
## Merc 450SL 13 12
## Merc 450SE 12 13
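So the most similar pair is the Merc 450SL and the Merc 450SE. To retrieve the similarity value itself:
# Cosine similarity of the most similar pair (the diagonal was zeroed out above)
A["Merc 450SL", "Merc 450SE"]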