setwd("C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit_7_Visualization")
mvt <- read.csv("mvt.csv", stringsAsFactor = F)
str(mvt)
## 'data.frame': 191641 obs. of 3 variables:
## $ Date : chr "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ...
Convert the data variable
mvt$Date <- strptime(mvt$Date, format = "%m/%d/%y %H:%M")
mvt$Weekday <- weekdays(mvt$Date)
mvt$Hours <- mvt$Date$hour
str(mvt)
## 'data.frame': 191641 obs. of 5 variables:
## $ Date : POSIXlt, format: "2012-12-31 23:15:00" "2012-12-31 22:00:00" ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ...
## $ Weekday : chr "Monday" "Monday" "Monday" "Monday" ...
## $ Hours : int 23 22 22 22 21 20 20 20 19 18 ...
table(mvt$Weekday)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 29284 27397 27118 26316 27319 26791 27416
WeekdayCounts <- as.data.frame(table(mvt$Weekday))
str(WeekdayCounts)
## 'data.frame': 7 obs. of 2 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
## $ Freq: int 29284 27397 27118 26316 27319 26791 27416
library(ggplot2)
ggplot(WeekdayCounts, aes(x = Var1, y = Freq)) + geom_line((aes(group = 1)))
The aes argument just groups all of our data into one line, since we want one line in our plot.
We want the day of the week to be chronical order. We can do this by making the Var1 variable an ordered factor variable. This signals to ggplot that the ordering is meaningful.
WeekdayCounts$Var1 <- factor(WeekdayCounts$Var1, ordered = T, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(WeekdayCounts, aes(x = Var1, y = Freq)) + geom_line(aes(group = 1))
Change the x y axis
ggplot(WeekdayCounts, aes(x = Var1, y = Freq)) + geom_line((aes(group = 1))) + xlab("Day of the week") + ylab("Total motor vehicle thefts")
table(mvt$Weekday, mvt$Hours)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## Friday 1873 932 743 560 473 602 839 1203 1268 1286 938 822
## Monday 1900 825 712 527 415 542 772 1123 1323 1235 971 737
## Saturday 2050 1267 985 836 652 508 541 650 858 1039 946 789
## Sunday 2028 1236 1019 838 607 461 478 483 615 864 884 787
## Thursday 1856 816 696 508 400 534 799 1135 1298 1301 932 731
## Tuesday 1691 777 603 464 414 520 845 1118 1175 1174 948 786
## Wednesday 1814 790 619 469 396 561 862 1140 1329 1237 947 763
##
## 12 13 14 15 16 17 18 19 20 21 22 23
## Friday 1207 857 937 1140 1165 1318 1623 1652 1736 1881 2308 1921
## Monday 1129 824 958 1059 1136 1252 1518 1503 1622 1815 2009 1490
## Saturday 1204 767 963 1086 1055 1084 1348 1390 1570 1702 2078 1750
## Sunday 1192 789 959 1037 1083 1160 1389 1342 1706 1696 2079 1584
## Thursday 1093 752 831 1044 1131 1258 1510 1537 1668 1776 2134 1579
## Tuesday 1108 762 908 1071 1090 1274 1553 1496 1696 1816 2044 1458
## Wednesday 1225 804 863 1075 1076 1289 1580 1507 1718 1748 2093 1511
DayHourCount <- as.data.frame(table(mvt$Weekday, mvt$Hours))
str(DayHourCount)
## 'data.frame': 168 obs. of 3 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Freq: int 1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
We can see that we have 168 observations– one for each day of the week and hour pair, and three different variables.
Convert the hour to numerical variable
DayHourCount$Hour <- as.numeric(as.character(DayHourCount$Var2))
This is how we convert a factor variable to a numeric variable.
ggplot(DayHourCount, aes(x = Hour, y = Freq)) + geom_line(aes(group = Var1))
let’s change the colors of the lines to correspond to the days of the week.
ggplot(DayHourCount, aes(x = Hour, y = Freq)) + geom_line(aes(group = Var1, color = Var1), size = 2)
we need to fix the order of the days so that they’ll show up in chronological order instead of in alphabetical order.
DayHourCount$Var1 <- factor(DayHourCount$Var1, ordered = T, levels = c("Monday", "Tuesday", "Wednesday", "Thursday","Friday", "Saturday","Sunday"))
ggplot(DayHourCount, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq))
We can change the label on the legend, and get rid of the y label to make our plot a little nicer.
ggplot(DayHourCount, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name = "Total MV Thefts") + theme(axis.title.y = element_blank())
scale_fill_gradient defines properties of the legend. theme argument is what you can do if you want to get rid of one of the axis labels.
We can also change the color scheme
ggplot(DayHourCount, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name = "Total MV Thefts", low = "white", high = "red") + theme(axis.title.y = element_blank())
library(maps)
library(ggmap)
chicago <- get_map(location = "chicago", zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=chicago&sensor=false
ggmap(chicago)
Now let’s plot the first 100 motor vehicle thefts in our data set on this map.
ggmap(chicago) + geom_point(data = mvt[1:100,], aes(x = Longitude, y = Latitude))
## Warning in loop_apply(n, do.ply): Removed 7 rows containing missing values
## (geom_point).
If we plotted all 190,000 motor vehicle thefts, we would just see a big black box, which wouldn’t be helpful at all. We’re more interested in whether or not an area has a high amount of crime, so let’s round our latitude and longitude to two digits of accuracy and create a crime counts data frame for each area.
LatLonCounts <- as.data.frame(table(round(mvt$Longitude,2), round(mvt$Latitude,2)))
str(LatLonCounts)
## 'data.frame': 1638 obs. of 3 variables:
## $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq: int 0 0 0 0 0 0 0 0 0 0 ...
Convert our longitude and latitude variables to numbers
LatLonCounts$Long <- as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat <- as.numeric(as.character(LatLonCounts$Var2))
Now, let’s plot these points on our map, making the size and color of the points depend on the total number of motor vehicle thefts.
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size = Freq))
## Warning in loop_apply(n, do.ply): Removed 615 rows containing missing
## values (geom_point).
Now our plot should have a point for every area defined by our latitude and longitude areas, and the points have a size and color corresponding to the number of crimes in that area.
Change the color scheme
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size = Freq)) + scale_color_gradient(low = "yellow", high = "red")
## Warning in loop_apply(n, do.ply): Removed 615 rows containing missing
## values (geom_point).
Create heatmap
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill = "red")
alpha will define how to scale the colors on the heat map according to the crime counts.
Our heatmap was plotting squares out in the water, which seems a little strange. We can fix this by removing the observations from our data frame that have Freq = 0.
LatLonCounts2 <- subset(LatLonCounts, Freq > 0)
ggmap(chicago) + geom_tile(data = LatLonCounts2, aes(x = Long, y = Lat, alpha = Freq), fill = "red")
murders <- read.csv("murders.csv")
str(murders)
## 'data.frame': 51 obs. of 6 variables:
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Population : int 4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
## $ PopulationDensity: num 94.65 1.26 57.05 56.43 244.2 ...
## $ Murders : int 199 31 352 130 1811 117 131 48 131 987 ...
## $ GunMurders : int 135 19 232 93 1257 65 97 38 99 669 ...
## $ GunOwnership : num 0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...
statesMap <- map_data("state")
str(statesMap)
## 'data.frame': 15537 obs. of 6 variables:
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ subregion: chr NA NA NA NA ...
ggplot(statesMap, aes(x = long, y = lat, group = group)) + geom_polygon(fill = "white", color = "black")
group is the variable defining how to draw the United States into groups by state.
Before we can plot our data on this map, we need to make sure that the state names are the same in the murders data frame and in the statesMap data frame. In the murders data frame, our state names are in the State variable, and they start with a capital letter. But in the statesMap data frame, our state names are in the region variable, and they’re all lowercase.
murders$region <- tolower(murders$State)
Now we can join the statesMap data frame with the murders data frame by using the merge function, which matches rows of a data frame based on a shared identifier.
murderMap <- merge(statesMap, murders, by = "region")
str(murderMap)
## 'data.frame': 15537 obs. of 12 variables:
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ subregion : chr NA NA NA NA ...
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Population : int 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
## $ PopulationDensity: num 94.7 94.7 94.7 94.7 94.7 ...
## $ Murders : int 199 199 199 199 199 199 199 199 199 199 ...
## $ GunMurders : int 135 135 135 135 135 135 135 135 135 135 ...
## $ GunOwnership : num 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
Make the plot
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = Murders)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")
It looks like California and Texas have the largest number of murders. But is that just because they’re the most populous states? Let’s create a map of the population of each state to check.
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = Population)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")
We have a population map here which looks exactly the same as our murders map. So we need to plot the murder rate instead of the number of murders to make sure we’re not just plotting a population map.
murderMap$MurderRate <- murderMap$Murders/murderMap$Population * 100000
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend")
There aren’t really any red states. Why? It turns out that Washington, DC is an outlier with a very high murder rate, but it’s such a small region on the map that we can’t even see it.
So let’s redo our plot, removing any observations with murder rates above 10, which we know will only exclude Washington, DC.
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend", limits = c(0,10))
Redo the map, but this time fill each state with the variable GunOwnership. This shows the percentage of people in each state who own a gun.
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = GunOwnership)) + geom_polygon(color = "black") + scale_fill_gradient(low = "black", high = "red", guide = "legend", limits = c(0,1))