Task 1: Data Analysis

Task 1.2 How did the use of Uber in NYC trend over the months considered in the dataset? Does it increase? Perform this analysis through visualization.

# Add a month column to each monthly data frame
uber_apr$month <- "Apr"
uber_may$month <- "May"
uber_jun$month <- "Jun"
uber_jul$month <- "Jul"
uber_aug$month <- "Aug"
uber_sep$month <- "Sep"

# Combine the monthly data frames into one data set
# Note: repeating the month assignment and a plain rbind() is simple but error-prone;
# a more compact alternative is sketched just below.
uber_raw_data <- rbind(uber_apr, uber_may, uber_jun, uber_jul, uber_aug, uber_sep)
#head(uber_raw_data)
#colnames(uber_raw_data)
#summary(uber_raw_data)

# Keep the months in chronological (not alphabetical) order
uber_raw_data$month <- factor(uber_raw_data$month,
                              levels = c("Apr", "May", "Jun", "Jul", "Aug", "Sep"))

# Derive the day of the week from the Date.Time column
uber_raw_data$Weekday <- weekdays(as.Date(uber_raw_data$Date.Time, format = "%m/%d/%Y"))
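
# A more compact alternative (a sketch, assuming the dplyr package is available):
# combine the raw monthly data frames and create the month column in one step,
# instead of assigning it manually to each data frame above.
# uber_raw_data <- dplyr::bind_rows(
#   list(Apr = uber_apr, May = uber_may, Jun = uber_jun,
#        Jul = uber_jul, Aug = uber_aug, Sep = uber_sep),
#   .id = "month")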


# Count the pick-ups in each month
pickup_monthly <- aggregate(uber_raw_data$month, FUN = length, by = list(uber_raw_data$month))


# Re-name columns
colnames(pickup_monthly) <- c("month", "pick_ups")

# Keep the months in chronological (not alphabetical) order for plotting
pickup_monthly$month <- factor(pickup_monthly$month,
                               levels = c("Apr", "May", "Jun", "Jul", "Aug", "Sep"))
#summary(pickup_monthly)
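
# An equivalent one-liner (a sketch, assuming dplyr is available):
# pickup_monthly <- dplyr::count(uber_raw_data, month, name = "pick_ups")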


# Plot the monthly pick-up counts as a bar chart
ggplt_month <- ggplot(pickup_monthly, aes(x = month, y = pick_ups, fill = pick_ups)) +

  # bar chart: bar heights taken directly from pick_ups
  geom_bar(stat = "identity", position = "dodge") +

  # Add a title
  labs(title = "Uber Pick-Up Trends over the Months")

ggplt_month

With the months plotted in chronological order, Uber pick-ups in NYC increase over the April–September period, reaching their highest level in September.
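
To quantify this trend, the month-over-month change can be computed directly from the counts in pickup_monthly (a minimal sketch, assuming the rows are in chronological order as above):

# Month-over-month growth in pick-ups, in percent
growth <- diff(pickup_monthly$pick_ups) / head(pickup_monthly$pick_ups, -1) * 100
names(growth) <- pickup_monthly$month[-1]
round(growth, 1)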

Task 2: Spatial Analysis

Task 2.1 Show how the number of pick-ups varies per day of the week and according to location.

#Task 2.1
#We plot the data as a density plot with 7 facets so that we can observe the variation across the weekdays.
#The density estimate is drawn on top of a NYC map, where the x axis is longitude and the y axis is latitude; density levels are shown in color.
#One solution is to use the package `ggmap`; another elegant approach is `leaflet` (a sketch follows the plot below).


# Reorder the weekdays so the facets run Monday through Sunday
# (the plot below uses uber_raw_data directly, so set the factor levels on it)
uber_raw_data$Weekday <- factor(uber_raw_data$Weekday,
                                levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
                                           "Friday", "Saturday", "Sunday"))


#get the map of NYC. 
NYC_map <- get_map(location = "NYC", zoom = 12, maptype = "roadmap")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=NYC&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NYC&sensor=false
#save the visual map
NYC <- ggmap(NYC_map)

# Overlay a 2-D density estimate of the pick-up locations on the NYC map, faceted by weekday.

NYC +
  stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
                 size = 8, bins = 30, alpha = 0.5, geom = "polygon",
                 data = uber_raw_data) +
  scale_fill_gradient(low = "blue", high = "red") +
  facet_wrap(~ Weekday) +
  labs(x = "Longitude", y = "Latitude",
       title = "Uber Pickup Density by Weekday",
       subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).
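
As mentioned in the comments above, leaflet is an interactive alternative to ggmap. A minimal sketch (assuming the leaflet package is installed; it plots a random sample of pick-up points on an OpenStreetMap base map rather than a faceted density):

library(leaflet)

# Sample the pick-ups so the interactive map stays responsive
set.seed(1)
sample_pickups <- uber_raw_data[sample(nrow(uber_raw_data), 5000), ]

leaflet(sample_pickups) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~Lon, lat = ~Lat, radius = 1, stroke = FALSE, fillOpacity = 0.3)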

Task 2.2 Show how the number of pick-ups varies per month and according to location.

# Task 2.2
# Same approach as Task 2.1, but faceting by month.


# The month column was already converted to a chronologically ordered factor when the
# monthly data frames were combined, so no further reordering is needed here.


NYC +
  stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
                 size = 8, bins = 30, alpha = 0.5, geom = "polygon",
                 data = uber_raw_data) +
  scale_fill_gradient(low = "blue", high = "red") +
  facet_wrap(~ month) +
  labs(x = "Longitude", y = "Latitude",
       title = "Uber Pickup Density by Month",
       subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).

Task 2.3 Show how the number of pick-ups varies per hour of the day and according to location.

# Task 2.3
# Same treatment as Tasks 2.1 and 2.2, faceting by hour of the day.


# Select the data needed for this task
#hourly_pickup <- subset(uber_raw_data, select = c(time, Lat, Lon))

# Reduce the time stamp to just the hour
#Hour <- format(as.POSIXct(strptime(hourly_pickup$time, "%H:%M:%S", tz = "")), format = "%H")
#hourly_pickup$time <- Hour
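
# A working sketch of the hour extraction, assuming Date.Time uses the
# "%m/%d/%Y %H:%M:%S" format of the raw files; the resulting column could be
# used in facet_wrap() below in place of `time`:
# uber_raw_data$hour <- format(strptime(uber_raw_data$Date.Time,
#                                       format = "%m/%d/%Y %H:%M:%S"), "%H")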

NYC +
  stat_density2d(aes(x = Lon, y = Lat, fill = ..level.., alpha = ..level..),
                 size = 8, bins = 30, alpha = 0.5, geom = "polygon",
                 data = uber_raw_data) +
  scale_fill_gradient(low = "blue", high = "red") +
  facet_wrap(~ time) +
  labs(x = "Longitude", y = "Latitude",
       title = "Uber Pickup Density by Hour of Day",
       subtitle = "NYC - April/2014 ~ Sep/2014")
## Warning: Removed 496975 rows containing non-finite values (stat_density2d).

Task 3.

Task 3.1 is asking you to calculate the cosine similarity between every pair of automobiles if we represent each automobile as a vector of the above 11 variables in “mtcars”. This question requires you to create a matrix (e.g., A) to store the calculation results.

Task 3.1 is straightforward: represent each automobile as a numeric vector of the 11 variables and compute the cosine similarity of every pair (the definition is recalled just below).
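
Recall that the cosine similarity of two vectors is their dot product divided by the product of their norms. A minimal hand-rolled version, shown only to make the definition concrete (the lsa::cosine() function is used for the actual computation below):

# Cosine similarity of two numeric vectors: dot product over the product of the norms
cosine_sim <- function(x, y) sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))

# Example: compare two cars by hand
cosine_sim(as.numeric(mtcars["Mazda RX4", ]), as.numeric(mtcars["Datsun 710", ]))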

# Task 3.1
data(mtcars)
# Keep mtcars as a data frame: converting it to a tibble (as.tibble) would drop
# the row names (the car models), which are needed to label the similarity matrix below.
#summary(mtcars)
#head(mtcars)

typeof(mtcars)
## [1] "list"
# Number of automobiles
Num <- dim(mtcars)[1] 
#colnames(mtcars)
#rownames(mtcars)

# Initialization
A <- matrix(0, Num, Num)

# Rename rows and cols
rownames(A) <- rownames(mtcars)
colnames(A) <- rownames(mtcars)
#A["Hornet 4 Drive","Valiant"]

# Fill the matrix with pairwise cosine similarities (lsa::cosine() comes from the lsa package);
# the matrix is symmetric, so each pair is computed twice here for simplicity.
for (x_name in rownames(mtcars)) {

  for (y_name in rownames(mtcars)) {
    x <- as.numeric(mtcars[x_name, ])
    y <- as.numeric(mtcars[y_name, ])
    A[x_name, y_name] <- lsa::cosine(x, y)
  }
}
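
# A vectorized alternative (a sketch): lsa::cosine() applied to a matrix returns the
# cosine similarities of its columns, so transposing mtcars yields the same matrix in one call:
# A <- lsa::cosine(t(as.matrix(mtcars)))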

summary(A)
##    Mazda RX4      Mazda RX4 Wag      Datsun 710     Hornet 4 Drive  
##  Min.   :0.9591   Min.   :0.9594   Min.   :0.9416   Min.   :0.8867  
##  1st Qu.:0.9787   1st Qu.:0.9786   1st Qu.:0.9591   1st Qu.:0.9477  
##  Median :0.9876   Median :0.9875   Median :0.9822   Median :0.9790  
##  Mean   :0.9859   Mean   :0.9859   Mean   :0.9774   Mean   :0.9674  
##  3rd Qu.:0.9967   3rd Qu.:0.9968   3rd Qu.:0.9923   3rd Qu.:0.9959  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Hornet Sportabout    Valiant         Duster 360       Merc 240D     
##  Min.   :0.9012    Min.   :0.9010   Min.   :0.9318   Min.   :0.8867  
##  1st Qu.:0.9567    1st Qu.:0.9577   1st Qu.:0.9776   1st Qu.:0.9583  
##  Median :0.9858    Median :0.9850   Median :0.9869   Median :0.9738  
##  Mean   :0.9731    Mean   :0.9731   Mean   :0.9818   Mean   :0.9662  
##  3rd Qu.:0.9982    3rd Qu.:0.9970   3rd Qu.:0.9958   3rd Qu.:0.9866  
##  Max.   :1.0000    Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Merc 230         Merc 280        Merc 280C        Merc 450SE    
##  Min.   :0.9572   Min.   :0.9593   Min.   :0.9580   Min.   :0.9366  
##  1st Qu.:0.9791   1st Qu.:0.9735   1st Qu.:0.9735   1st Qu.:0.9809  
##  Median :0.9854   Median :0.9840   Median :0.9842   Median :0.9894  
##  Mean   :0.9846   Mean   :0.9851   Mean   :0.9850   Mean   :0.9835  
##  3rd Qu.:0.9931   3rd Qu.:0.9968   3rd Qu.:0.9969   3rd Qu.:0.9971  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Merc 450SL      Merc 450SLC     Cadillac Fleetwood Lincoln Continental
##  Min.   :0.9375   Min.   :0.9359   Min.   :0.8783     Min.   :0.8899     
##  1st Qu.:0.9812   1st Qu.:0.9806   1st Qu.:0.9411     1st Qu.:0.9489     
##  Median :0.9894   Median :0.9895   Median :0.9762     Median :0.9811     
##  Mean   :0.9837   Mean   :0.9834   Mean   :0.9640     Mean   :0.9686     
##  3rd Qu.:0.9972   3rd Qu.:0.9970   3rd Qu.:0.9972     3rd Qu.:0.9973     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000     Max.   :1.0000     
##  Chrysler Imperial    Fiat 128       Honda Civic     Toyota Corolla  
##  Min.   :0.9088    Min.   :0.9101   Min.   :0.9203   Min.   :0.8870  
##  1st Qu.:0.9611    1st Qu.:0.9351   1st Qu.:0.9422   1st Qu.:0.9151  
##  Median :0.9874    Median :0.9529   Median :0.9539   Median :0.9375  
##  Mean   :0.9752    Mean   :0.9579   Mean   :0.9585   Mean   :0.9451  
##  3rd Qu.:0.9957    3rd Qu.:0.9800   3rd Qu.:0.9738   3rd Qu.:0.9758  
##  Max.   :1.0000    Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Toyota Corona    Dodge Challenger  AMC Javelin       Camaro Z28    
##  Min.   :0.9530   Min.   :0.8965   Min.   :0.9039   Min.   :0.9329  
##  1st Qu.:0.9686   1st Qu.:0.9539   1st Qu.:0.9588   1st Qu.:0.9763  
##  Median :0.9857   Median :0.9841   Median :0.9870   Median :0.9860  
##  Mean   :0.9813   Mean   :0.9715   Mean   :0.9742   Mean   :0.9815  
##  3rd Qu.:0.9956   3rd Qu.:0.9986   3rd Qu.:0.9979   3rd Qu.:0.9958  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Pontiac Firebird   Fiat X1-9      Porsche 914-2     Lotus Europa   
##  Min.   :0.8839   Min.   :0.9223   Min.   :0.9578   Min.   :0.8783  
##  1st Qu.:0.9451   1st Qu.:0.9452   1st Qu.:0.9709   1st Qu.:0.9032  
##  Median :0.9787   Median :0.9634   Median :0.9863   Median :0.9576  
##  Mean   :0.9665   Mean   :0.9660   Mean   :0.9824   Mean   :0.9453  
##  3rd Qu.:0.9977   3rd Qu.:0.9869   3rd Qu.:0.9957   3rd Qu.:0.9791  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Ford Pantera L    Ferrari Dino    Maserati Bora      Volvo 142E    
##  Min.   :0.9376   Min.   :0.8867   Min.   :0.8958   Min.   :0.9395  
##  1st Qu.:0.9720   1st Qu.:0.9118   1st Qu.:0.9252   1st Qu.:0.9568  
##  Median :0.9840   Median :0.9560   Median :0.9660   Median :0.9830  
##  Mean   :0.9807   Mean   :0.9467   Mean   :0.9529   Mean   :0.9762  
##  3rd Qu.:0.9954   3rd Qu.:0.9701   3rd Qu.:0.9758   3rd Qu.:0.9905  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

Task 3.2 is to find the most similar automobile (only one is OK) for each one and print the results as follows:

To find the most similar automobile, we simply pick the pair with the highest cosine similarity; a per-automobile version is sketched after the output below.

# Task 3.2
# Zero out the diagonal so each automobile is not compared with itself
A <- A - diag(Num)

# Locate the largest similarity in the matrix
pos <- which(A == max(A), arr.ind = TRUE)
# The most similar pair of automobiles
pos
##            row col
## Merc 450SL  13  12
## Merc 450SE  12  13
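
Since the task asks for the most similar automobile for each one, the same matrix can also be scanned row by row (a minimal sketch, reusing A with its zeroed diagonal):

# For every automobile, pick the name of the column with the largest similarity in its row
most_similar <- colnames(A)[apply(A, 1, which.max)]
names(most_similar) <- rownames(A)
head(most_similar)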