library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
# install.packages("maps")
library(ggvis)
## Warning: package 'ggvis' was built under R version 3.1.1
##
## Attaching package: 'ggvis'
##
## The following object is masked from 'package:ggplot2':
##
## resolution
library(reshape2)
library(maps)
## Warning: package 'maps' was built under R version 3.1.1
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.1
library(maps)
library(maptools)
## Warning: package 'maptools' was built under R version 3.1.1
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.1.1
## Checking rgeos availability: FALSE
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library(mapproj)
## Warning: package 'mapproj' was built under R version 3.1.1
# Safai video: ggplot2
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
tail(diamonds)
## carat cut color clarity depth table price x y z
## 53935 0.72 Premium D SI1 62.7 59 2757 5.69 5.73 3.58
## 53936 0.72 Ideal D SI1 60.8 57 2757 5.75 5.76 3.50
## 53937 0.72 Good D SI1 63.1 55 2757 5.69 5.75 3.61
## 53938 0.70 Very Good D SI1 62.8 60 2757 5.66 5.68 3.56
## 53939 0.86 Premium H SI2 61.0 58 2757 6.15 6.12 3.74
## 53940 0.75 Ideal D SI2 62.2 55 2757 5.83 5.87 3.64
# Work on a random 1000-row sample of diamonds so the scatterplots draw quickly.
# The sample is unseeded, so the exact points differ from run to run.
small <- diamonds[sample(nrow(diamonds), 1000), ]
# Aesthetics declared on the layer: a plain scatter, then coloured by cut.
ggplot(small) + geom_point(aes(x = carat, y = price)) + ggtitle('example 01')
ggplot(small) + geom_point(aes(x = carat, y = price, color = cut)) + ggtitle('example 02')
ggplot(small) + geom_point(aes(x = carat, y = price, color = cut)) + ggtitle('example 02')
# The same coloured plot with x/y declared once in ggplot(): colour is added
# either on the layer or globally. (The stray empty trailing argument that was
# inside aes() has been removed.)
ggplot(small, aes(x = carat, y = price)) + geom_point(aes(color = cut)) + ggtitle('example 01')
ggplot(small, aes(x = carat, y = price, color = cut)) + geom_point() + ggtitle('example 02')
##########################################
### PLOTTING WITH BASE R
##########################################
#EXAMPLE 5.1
# FORMAT plot(x,y, type="l") # line
plot(faithful, type='p')
# My check with the data set, in order to know why
names(faithful)
## [1] "eruptions" "waiting"
head(faithful)
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
dim(faithful)
## [1] 272 2
#Connect each x,y point by a line.
plot(faithful, type='l') #This plot doesn't make much sense, just for illustration
# EXAMPLE 5.2
# PLOTTING CATEGORICAL VARIABLES
#First, let's try:
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(iris)
## [1] 150 5
plot(iris$Species)
#' Notice that it became a bar plot
#' Any 'factor' when plotted becomes a bar chart
library(ggplot2)
str(movies$mpaa)
## Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
plot(movies[, "mpaa"])
# Excluding the movies without ratings
mpaa.movies <- subset(movies, mpaa != "")
plot(mpaa.movies[, "mpaa"])
# Check dataset of "mtcars"
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
dim(mtcars)
## [1] 32 11
# Plot
barplot(mtcars[ , "mpg"])
plot(mtcars[ , "mpg"])
# EXAMPLE 5.3
#Adding title/labels
plot(faithful, ann=FALSE) #get rid of the current labels
title(main="The Title", xlab="X Axis Label", ylab="Y Axis Label")
# EXAMPLE 5.4
#You can take any numerical vector and plot a histogram very quickly.
#The Y-axis is the frequency
names(faithful)
## [1] "eruptions" "waiting"
hist(faithful$waiting)
hist(faithful$eruptions)
pairs(faithful)
# P A I R S
#' @description Use pairs() to quickly produce a complete matrix of scatterplots
#'
# EXAMPLE 5.5
pairs(iris)
pairs(iris[,-5]) # everything except the 5th column
#Example 5.6
#Let's jump right in and create a few plots
names(movies)
## [1] "title" "year" "length" "budget" "rating"
## [6] "votes" "r1" "r2" "r3" "r4"
## [11] "r5" "r6" "r7" "r8" "r9"
## [16] "r10" "mpaa" "Action" "Animation" "Comedy"
## [21] "Drama" "Documentary" "Romance" "Short"
head(movies)
## title year length budget rating votes r1 r2 r3
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5
## 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0 14.5 4.5
## 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0 0.0 0.0
## 4 $40,000 1996 70 NA 8.2 6 14.5 0.0 0.0
## 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5 4.5 0.0
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5
## r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 4.5 14.5 24.5 24.5 14.5 4.5 4.5 0 0 1 1
## 2 24.5 14.5 14.5 14.5 4.5 4.5 14.5 0 0 1 0
## 3 0.0 0.0 24.5 0.0 44.5 24.5 24.5 0 1 0 0
## 4 0.0 0.0 0.0 0.0 0.0 34.5 45.5 0 0 1 0
## 5 14.5 14.5 4.5 0.0 0.0 0.0 24.5 0 0 0 0
## 6 14.5 14.5 14.5 4.5 4.5 14.5 14.5 0 0 0 1
## Documentary Romance Short
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
dim(movies)
## [1] 58788 24
str(movies)
## 'data.frame': 58788 obs. of 24 variables:
## $ title : chr "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
## $ Action : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int 0 0 1 0 0 0 0 1 0 0 ...
# Let's try different Y-axis variables
ggplot(movies, aes(x=year, y=budget)) + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).
ggplot(movies, aes(x=year, y=rating)) + geom_point()
ggplot(movies, aes(x=year, y=length)) + geom_point()
ggplot(movies, aes(x=year, y=votes)) + geom_point()
# Now, we are switching to bar-style plots of a single variable.
# `year` is continuous, so the values have to be binned: use geom_histogram().
# From ggplot2 2.0 on, geom_bar() counts rows per x value and binning belongs
# to geom_histogram(), so geom_bar(binwidth = ...) is avoided here.
ggplot(movies, aes(x=year)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
# It is usually a good idea to control the binwidth ourselves.
# Let's do it by decades...
ggplot(movies, aes(x=year)) + geom_histogram(binwidth=10)
# We can color the bars by any variable.
# Here we are coloring by movies$mpaa. Notice the use of fill.
ggplot(movies, aes(x=year, fill=mpaa)) + geom_histogram(binwidth=10)
#----------------------------------------------------------
library(ggvis)
mtcars %>% ggvis(x = ~wt, y = ~mpg) %>% layer_points()
mtcars %>% ggvis(x = ~wt, y = ~mpg, stroke := "red") %>% layer_points()
#----------------------------------------------------------
#Example 5.7
## BUILDING A PLOT ITERATIVELY
## AVG MOVIE RATINGS BY YEAR
ggplot(movies, aes(x=year, y=rating, color=rating)) + geom_point()
p <- ggplot(movies, aes(x=year, y=rating, color=factor(rating))) + geom_point()
p
#Let's try it with dplyr
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.1
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dplot <- movies %>%
group_by(year) %>%
summarize(yearly_rating = mean(rating)) %>%
ggplot(aes(x=year, y=yearly_rating, color=yearly_rating)) + geom_point()
dplot
#Let's fix the Y-axis scale so that it goes from 0 to 10.
dp2 <- dplot + scale_y_continuous(limits=c(0,10))
dp2
#' Example 5.7A
#' How to create the same plot using ggVis
#' Using ggVis
library(ggvis)
library(dplyr)
movies %>%
group_by(year) %>%
summarize(yearly_rating = mean(rating)) %>%
ggvis(x= ~year, y= ~yearly_rating, stroke :='red') %>%
layer_points() %>%
scale_numeric("y", domain= c(1,10))
#Example 5.7B -Alternative Way to summarize using the very flexible stat_summary option
#' This is a powerful way to transform and plot.
#' Note that we didn't have to create a new data frame
ggplot(movies, aes(x=year, y=rating)) + stat_summary(fun.y=mean, geom="point")
#Example 5.8
# Mapping: colour is tied to a data column inside aes(), so points are
# coloured according to their mpaa value.
p <- ggplot(movies, aes(x=year, y=rating, color=mpaa)) + geom_point() # instance of "mapping" (inside aes)
p
# Setting: colour is a fixed value passed to the layer outside aes(), so every
# point is drawn in the same colour. (A colour argument handed to ggplot()
# itself is ignored, and a column name is not a colour value.)
q <- ggplot(movies, aes(x=year, y=rating)) + geom_point(color="blue") # an instance of "setting" (outside aes)
q
##########
#IN CLASS EXERCISE
#Using mtcars
##########
#Example 5.9.
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Empty plot object bound to mtcars; each line below adds one point layer to it.
base <- ggplot(mtcars)
# Run these one at a time and watch the plot change:
# every step maps one more variable to an aesthetic.
# 1. position only
base + geom_point(aes(x = cyl, y = mpg))
# 2. gear as a continuous colour
base + geom_point(aes(x = cyl, y = mpg, color = gear))
# 3. gear as a discrete colour
base + geom_point(aes(x = cyl, y = mpg, color = factor(gear)))
# 4. point size from horsepower
base + geom_point(aes(x = cyl, y = mpg, color = factor(gear), size = hp))
# 5. point shape from the number of carburetors
base + geom_point(
  aes(x = cyl, y = mpg, color = factor(gear), size = hp, shape = factor(carb))
)
#Example 5.9a
#Subsetting the data and plotting only what we are interested in
#Say we are only interestd in Mercedes cars in the dataset
tf <- grepl("Merc", row.names(mtcars))
mtcars[tf,]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.0 0 0 3 3
ggplot(mtcars[tf,])+ geom_point(aes(x=cyl, y=mpg, color=factor(gear), size=hp, shape=factor(carb)))
##### Barplot vs Histograms
#Example 5.10
library(ggplot2)
str(quakes)
## 'data.frame': 1000 obs. of 5 variables:
## $ lat : num -20.4 -20.6 -26 -18 -20.4 ...
## $ long : num 182 181 184 182 182 ...
## $ depth : int 562 650 42 626 649 195 82 194 211 622 ...
## $ mag : num 4.8 4.2 5.4 4.1 4 4 4.8 4.4 4.7 4.3 ...
## $ stations: int 41 15 43 19 11 12 43 15 35 19 ...
# depth is continuous, so it is binned with geom_histogram(); from ggplot2 2.0
# on, geom_bar() counts rows per x value and no longer takes a binwidth.
ggplot(quakes, aes(x=depth)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# The same histogram with an explicit bin width of 50 depth units.
hgram <- ggplot(quakes, aes(x=depth)) + geom_histogram(binwidth=50)
hgram
#Example 5.11
#Let's plot a BAR PLOT instead
#' We should use stat="identity"
quakes_agg <- aggregate(mag ~ round(depth, -2), data=quakes, FUN=length) #rounding to 100s
str(quakes_agg) #only 8 rows
## 'data.frame': 8 obs. of 2 variables:
## $ round(depth, -2): num 0 100 200 300 400 500 600 700
## $ mag : int 77 271 159 66 55 142 220 10
names(quakes_agg) <- c("depth", "mag")
bplot <- ggplot(quakes_agg) + geom_bar(aes(x=depth, y=mag), stat="identity")
bplot
######################################
#Titles
#Example 5.12
names(faithful)
## [1] "eruptions" "waiting"
p <- ggplot(faithful,aes(x=waiting, y=eruptions)) + geom_point()
p
#add a title
p + labs(title="Waiting Time Between Eruptions")
#change the Y-axis title
p + ylab("Time between eruptions")
#Hide (supress) the title for the X-axis
p + xlab("")
#Say we want the x-axis text ever 5 minutes (instead of 10)
p + scale_x_continuous(limits=c(30, 110), breaks=seq(40,100,by=5))
#Example 5.13
p <- ggplot(movies, aes(x=year, y=rating, color=rating)) + geom_point()
p <- p + scale_color_continuous(low="red", high="green")
p
#Example 5.14
plot <- ggplot(mtcars, aes(x=cyl, y=mpg, fill=factor(gear))) + geom_bar(stat="identity")
plot + scale_fill_discrete(name="")
plot + scale_fill_discrete(name="Num Gears")
plot + guides(fill=FALSE)
# DO Quakes with higher Depth come with higher Magnitude?
#Example 5.15
head(quakes)
## lat long depth mag stations
## 1 -20.42 181.6 562 4.8 41
## 2 -20.62 181.0 650 4.2 15
## 3 -26.00 184.1 42 5.4 43
## 4 -17.97 181.7 626 4.1 19
## 5 -20.42 182.0 649 4.0 11
## 6 -19.68 184.3 195 4.0 12
head(cut(quakes$depth, c(0, 200, 400, 800)) )
## [1] (400,800] (400,800] (0,200] (400,800] (400,800] (0,200]
## Levels: (0,200] (200,400] (400,800]
quakes$depth.group
## NULL
ggplot(quakes, aes(x=depth, y=mag)) + geom_boxplot()
# Let's create a new grouping column based on quake depth:
# cut() bins depth into the intervals (0,200], (200,400] and (400,800].
quakes$depth.group <- cut(quakes$depth, c(0, 200, 400, 800))
# Put the depth group (not the raw, continuous depth) on the x axis so that
# there is exactly one box per group. With x=depth the boxes had no common
# width, which is what triggered the position_dodge warning.
ggplot(quakes, aes(x=depth.group, y=mag, fill=depth.group)) + geom_boxplot()
# USING dplyr (the same example, 5.15)
# mutate() adds the depth_group column on the fly, so quakes itself is not
# modified; the result is piped straight into ggplot().
# As above, the group (not the raw depth) goes on the x axis: one box per group.
quakes %>%
  mutate(depth_group = cut(depth, c(0,200,400,800))) %>%
  ggplot(., aes(x=depth_group, y=mag, fill=depth_group)) + geom_boxplot()
##--------------------------------------------------------------------------##
#Example 5.16 (Slide is at the end of the deck)
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
h <- ggplot(diamonds[1:1000,], aes(x=cut)) + geom_bar(); h #simple histogram
h <- ggplot(diamonds[1:1000,], aes(x=cut, fill=color)) + geom_bar(); h # stacked by default
h <- ggplot(diamonds[1:1000,], aes(x=cut, fill=color)) + geom_bar(position="dodge") #unstack
h
#Example 5.17
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Add a new column holding each species' mean petal length; transform keeps
# every original row, so the mean is repeated on all rows of a species.
iris2 <- ddply(iris, "Species", transform, avg.petal = mean(Petal.Length))
# Use the new column as the bar height. The heights are already computed, so
# the bars need stat="identity" (geom_bar's default stat computes its own
# heights and cannot be combined with a y aesthetic).
# Keep one row per species first: otherwise the repeated rows of a species
# would be stacked on top of each other and inflate the bar.
ggplot(unique(iris2[, c("Species", "avg.petal")])) +
  geom_bar(aes(x=Species, y=avg.petal), stat="identity")
### FACETS
#Example 5.18
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
base_g <- ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point()
base_g
base_g + facet_grid(. ~ cyl) #side-by-side comparisons
base_g + facet_grid(gear ~ .) # top-to-bottom comparisons
base_g + facet_grid(gear ~ cyl)
##SECOND EXAMPLE TO UNDERSTAND FACETS
# This time we use the diamonds data set
#Example 5.19 -------------------------------------
unique(diamonds$cut)
## [1] Ideal Premium Good Very Good Fair
## Levels: Fair < Good < Very Good < Premium < Ideal
unique(diamonds$clarity)
## [1] SI2 SI1 VS1 VS2 VVS2 VVS1 I1 IF
## Levels: I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
dim(diamonds)
## [1] 53940 10
# Scatter of price against carat for the first 1000 diamonds; the facetted
# variants below all start from this un-facetted plot.
base_g <- ggplot(diamonds[1:1000,]) + geom_point(aes(x=carat, y=price))
base_g
# g1-g3 are only stored here; type their names to draw them.
g1 <- base_g + facet_grid(. ~ cut) #horizontal stacking: one column per cut
g2 <- base_g + facet_grid(cut ~ .) #vertical stacking: one row per cut
g3 <- base_g + facet_grid(clarity ~ cut) #full grid: clarity in rows, cut in columns
#We did not discuss wrap in class, but good to know
# Start from base_g rather than g1: a plot has a single facet specification,
# so stacking facet_wrap on top of g1's facet_grid would only replace it.
g_wrap <- base_g + facet_wrap(clarity ~ cut) #good for exploring, to search for interesting cases
g_wrap
library(ggplot2)
movieGenres <- movies[c(18:23)] #subset to 6 genres. We are only interested in genres
head(movies[c(18:23)])
## Action Animation Comedy Drama Documentary Romance
## 1 0 0 1 1 0 0
## 2 0 0 1 0 0 0
## 3 0 1 0 0 0 0
## 4 0 0 1 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 1 0 0
head(movieGenres)
## Action Animation Comedy Drama Documentary Romance
## 1 0 0 1 1 0 0
## 2 0 0 1 0 0 0
## 3 0 1 0 0 0 0
## 4 0 0 1 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 1 0 0
tail(movies, 20)
## title year length budget rating votes r1 r2
## 58769 Zzikhimyeon jukneunda 2000 94 NA 4.7 45 14.5 4.5
## 58770 Zzim 1998 101 NA 4.5 14 14.5 4.5
## 58771 Zzyzx 2005 90 1000000 8.0 10 24.5 0.0
## 58772 alaska.de 2000 89 NA 6.2 186 4.5 4.5
## 58773 barefootin' 1987 3 NA 5.7 6 0.0 0.0
## 58774 deadend.com 2002 120 NA 6.9 53 64.5 4.5
## 58775 e-Dreams 2001 94 NA 6.8 86 4.5 0.0
## 58776 eMale 2001 17 NA 7.3 15 0.0 0.0
## 58777 eRATicate 2003 9 NA 6.0 5 0.0 0.0
## 58778 eXXXorcismos 2002 78 NA 4.2 11 34.5 0.0
## 58779 eXistenZ 1999 97 NA 6.7 14742 4.5 4.5
## 58780 f2point8 2002 20 NA 5.0 13 4.5 0.0
## 58781 f8 2001 13 NA 7.6 7 0.0 0.0
## 58782 pURe kILLjoy 1998 87 NA 5.2 6 0.0 14.5
## 58783 sIDney 2002 15 NA 7.0 8 14.5 0.0
## 58784 tom thumb 1958 98 NA 6.5 274 4.5 4.5
## 58785 www.XXX.com 2003 105 NA 1.1 12 45.5 0.0
## 58786 www.hellssoapopera.com 1999 100 NA 6.6 5 24.5 0.0
## 58787 xXx 2002 132 85000000 5.5 18514 4.5 4.5
## 58788 xXx: State of the Union 2005 101 87000000 3.9 1584 24.5 4.5
## r3 r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation
## 58769 4.5 24.5 24.5 14.5 4.5 4.5 0.0 4.5 0 0
## 58770 24.5 4.5 14.5 24.5 0.0 4.5 4.5 0.0 0 0
## 58771 0.0 0.0 0.0 0.0 0.0 0.0 24.5 64.5 0 0
## 58772 4.5 4.5 4.5 14.5 14.5 24.5 14.5 4.5 0 0
## 58773 0.0 14.5 45.5 0.0 34.5 0.0 0.0 0.0 0 1
## 58774 0.0 4.5 4.5 0.0 4.5 4.5 4.5 4.5 0 0
## 58775 0.0 4.5 4.5 14.5 14.5 34.5 4.5 14.5 0 0
## 58776 0.0 4.5 4.5 4.5 0.0 24.5 24.5 24.5 0 0
## 58777 0.0 0.0 0.0 44.5 0.0 0.0 24.5 44.5 0 0
## 58778 0.0 0.0 0.0 4.5 0.0 14.5 4.5 24.5 0 0
## 58779 4.5 4.5 4.5 14.5 24.5 24.5 14.5 4.5 R 0 0
## 58780 4.5 4.5 14.5 0.0 14.5 4.5 14.5 24.5 0 0
## 58781 0.0 0.0 0.0 24.5 14.5 24.5 24.5 0.0 0 1
## 58782 14.5 14.5 0.0 34.5 0.0 0.0 0.0 14.5 0 0
## 58783 0.0 14.5 0.0 0.0 24.5 14.5 14.5 24.5 1 0
## 58784 4.5 4.5 14.5 14.5 24.5 14.5 4.5 4.5 0 1
## 58785 0.0 0.0 0.0 0.0 24.5 0.0 0.0 24.5 0 0
## 58786 24.5 0.0 0.0 0.0 0.0 0.0 24.5 44.5 0 0
## 58787 4.5 4.5 14.5 14.5 14.5 14.5 4.5 4.5 PG-13 1 0
## 58788 4.5 4.5 4.5 14.5 4.5 4.5 4.5 14.5 PG-13 1 0
## Comedy Drama Documentary Romance Short
## 58769 0 0 0 0 0
## 58770 0 0 0 0 0
## 58771 0 0 0 0 0
## 58772 0 1 0 0 0
## 58773 0 0 0 0 1
## 58774 0 1 0 0 0
## 58775 0 0 1 0 0
## 58776 1 0 0 0 1
## 58777 1 0 0 0 1
## 58778 0 1 0 0 0
## 58779 0 1 0 0 0
## 58780 0 0 0 0 1
## 58781 0 0 0 0 1
## 58782 0 0 0 0 0
## 58783 0 0 0 0 1
## 58784 0 0 0 0 0
## 58785 0 1 0 1 0
## 58786 0 0 0 0 0
## 58787 0 0 0 0 0
## 58788 0 0 0 0 0
cor(movieGenres) # 6x6 cor matrix
## Action Animation Comedy Drama Documentary Romance
## Action 1.00000 -0.05443 -0.08289 0.00776 -0.06949 -0.02336
## Animation -0.05443 1.00000 0.17967 -0.17916 -0.05204 -0.06637
## Comedy -0.08289 0.17967 1.00000 -0.25578 -0.14084 0.10986
## Drama 0.00776 -0.17916 -0.25578 1.00000 -0.17344 0.10355
## Documentary -0.06949 -0.05204 -0.14084 -0.17344 1.00000 -0.07158
## Romance -0.02336 -0.06637 0.10986 0.10355 -0.07158 1.00000
#It is much easier to plot when there is one data point per row
library(reshape2)
#for that, we melt the data.
mdf <-melt(cor(movieGenres))
names(mdf) # 'melt' automatically creates columns Var1, Var2, value
## [1] "Var1" "Var2" "value"
head(mdf)
## Var1 Var2 value
## 1 Action Action 1.00000
## 2 Animation Action -0.05443
## 3 Comedy Action -0.08289
## 4 Drama Action 0.00776
## 5 Documentary Action -0.06949
## 6 Romance Action -0.02336
# Book: Intro Stat with R, P140
library(ISwR)
## Warning: package 'ISwR' was built under R version 3.1.2
attach(heart.rate)
heart.rate
## hr subj time
## 1 96 1 0
## 2 110 2 0
## 3 89 3 0
## 4 95 4 0
## 5 128 5 0
## 6 100 6 0
## 7 72 7 0
## 8 79 8 0
## 9 100 9 0
## 10 92 1 30
## 11 106 2 30
## 12 86 3 30
## 13 78 4 30
## 14 124 5 30
## 15 98 6 30
## 16 68 7 30
## 17 75 8 30
## 18 106 9 30
## 19 86 1 60
## 20 108 2 60
## 21 85 3 60
## 22 78 4 60
## 23 118 5 60
## 24 100 6 60
## 25 67 7 60
## 26 74 8 60
## 27 104 9 60
## 28 92 1 120
## 29 114 2 120
## 30 83 3 120
## 31 83 4 120
## 32 118 5 120
## 33 94 6 120
## 34 71 7 120
## 35 74 8 120
## 36 102 9 120
# Same as:
heart.rate <- data.frame(hr = c(96,110,89,95,128,100,72,79,100,
92,106,86,78,124,98,68,75,106,
86,108,85,78,118,100,67,74,104,
92,114,83,83,118,94,71,74,102),
subj=gl(9,1,36),
time=gl(4,9,36,labels=c(0,30,60,120)))
gl(9,1,36)
## [1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8
## [36] 9
## Levels: 1 2 3 4 5 6 7 8 9
# Two-way ANOVA of heart rate on subject and time. The data frame is passed
# explicitly so the model does not depend on heart.rate being attach()ed.
anova(lm(hr ~ subj + time, data = heart.rate))
## Analysis of Variance Table
##
## Response: hr
## Df Sum Sq Mean Sq F value Pr(>F)
## subj 8 8967 1121 90.64 4.9e-16 ***
## time 3 151 50 4.07 0.018 *
## Residuals 24 297 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One heart-rate trace per subject across the time points. with() resolves the
# columns inside heart.rate, so the call does not rely on attach().
with(heart.rate, interaction.plot(time, subj, hr))
## Back to Ram's
hm <- ggplot(data=mdf, aes(x=Var1, y=Var2, fill=value)) + geom_tile()
hm
#Good, but the colors are very difficult to interpret.
#Example 5.20b better colors --------------
#'
#Side Example showing the use of colorRampPalette
# colorRampPalette() returns a function; calling that function with n yields
# n hex colors interpolated from the first color to the last.
Red_to_White_range<-colorRampPalette(c("red", "white" ) )
Red_to_White_range(2) # creates a vector of 2 hex color values (the two end points)
## [1] "#FF0000" "#FFFFFF"
Red_to_White_range(15) # creates a vector of 15 hex color values
## [1] "#FF0000" "#FF1212" "#FF2424" "#FF3636" "#FF4848" "#FF5B5B" "#FF6D6D"
## [8] "#FF7F7F" "#FF9191" "#FFA3A3" "#FFB6B6" "#FFC8C8" "#FFDADA" "#FFECEC"
## [15] "#FFFFFF"
#' End of Side Example. Back to main Heat Map
# Set up a coloring scheme using colorRampPalette.
red <- rgb(1, 0, 0)
green <- rgb(0, 1, 0)
blue <- rgb(0, 0, 1)
white <- rgb(1, 1, 1)
# Each palette function can be divided into as many color values as we like.
red_to_White_range <- colorRampPalette(c(red, white))
white_to_Green_range <- colorRampPalette(c(white, green))
######## colors are ready
# We want "red" colors for negative correlations and "green" for positives,
# and we gray out the 1s along the diagonal.
# scale_fill_gradient2() takes a single colour for each of low/mid/high, so the
# full colour sequence is given to scale_fill_gradientn() instead. With the
# limits fixed at [-1, 1] the colours are spread evenly over that range:
# -1 is red, 0 falls on white, values towards 1 turn green, and exactly 1
# (the diagonal) lands on the final "gray".
hm <- hm + scale_fill_gradientn(
  colours = c(red_to_White_range(100), white_to_Green_range(100), "gray"),
  limits = c(-1, 1)
)
hm
#Example 5.21 ----MAPS
library("ggmap")
locations <- c("PVG", "sFO", "Chennai", "London", "Melbourne", "Johannesburg, SA")
coord_visited <- geocode(locations) #geocode is part of ggmap
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PVG&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=sFO&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Chennai&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Melbourne&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Johannesburg,+SA&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
locations_df <- data.frame(locations, coord_visited)
mp <- NULL #clean start
#Using GGPLOT, plot the Base World Map
world_map <- borders("world", colour="gray50", fill="gray50") # create a layer of borders
# ?borders # This is part of ggplot2
#align the two with lat and lon as the common coordinates
mp <- ggplot(locations_df, aes(x=lon, y=lat)) + world_map
mp
#Now Layer the cities on top
mp <- mp+ geom_point(color="blue", size=5)
mp
#we can use any variable for the color aesthetic
mp <- mp+ geom_point(aes(color=lon), size=3)
mp
#we can use any column for the point size aesthetic
mp <- mp+ geom_point(aes(color=lon, size=lat))
mp
#Example 5.22 ---Maps package
# Extra
#PLOTTING USING The maps package.
#That is, without using ggplot2
library(maps)
map("world", fill=TRUE, col="white", bg="lightblue", ylim=c(-60, 90), mar=c(0,0,0,0))
points(locations_df$lon,locations_df$lat, col="red", pch=16)
# From: http://dict.baidu.com/s?wd=choropleth+ "The character of expressing information of choropleth maps in internet is analyzed, and the main idea and technology of publishing interactive choropleth maps are expatiated." [The accompanying non-English text was lost to a character-encoding error.]
# For a choropleth to work, we need TWO data frames. One is a "map" data frame - the lat/long points and a way to identify the shapes. The second data frame has the values that we want to base our color on, and it has the same IDs. Important: the two data frames are linked by that common "ID" column. That is how ggplot knows to link the two data frames.
# Choropleth map: [the non-English definition that followed here was lost to a character-encoding error and cannot be recovered from this file]
#Example 5.23 -------------------------------------
# 1. First, create the maps data frame
# ??map_data
states_map_coords <- map_data("state") # Create a data frame of map data
head(states_map_coords)
## long lat group order region subregion
## 1 -87.46 30.39 1 1 alabama <NA>
## 2 -87.48 30.37 1 2 alabama <NA>
## 3 -87.53 30.37 1 3 alabama <NA>
## 4 -87.53 30.33 1 4 alabama <NA>
## 5 -87.57 30.33 1 5 alabama <NA>
## 6 -87.59 30.33 1 6 alabama <NA>
tail(states_map_coords)
## long lat group order region subregion
## 15594 -106.3 41.01 63 15594 wyoming <NA>
## 15595 -106.9 41.01 63 15595 wyoming <NA>
## 15596 -107.3 41.02 63 15596 wyoming <NA>
## 15597 -107.9 41.02 63 15597 wyoming <NA>
## 15598 -109.1 40.99 63 15598 wyoming <NA>
## 15599 -109.1 41.00 63 15599 wyoming <NA>
dim(states_map_coords)
## [1] 15537 6
head(states_map_coords)
## long lat group order region subregion
## 1 -87.46 30.39 1 1 alabama <NA>
## 2 -87.48 30.37 1 2 alabama <NA>
## 3 -87.53 30.37 1 3 alabama <NA>
## 4 -87.53 30.33 1 4 alabama <NA>
## 5 -87.57 30.33 1 5 alabama <NA>
## 6 -87.59 30.33 1 6 alabama <NA>
#2. Create a data frame with the values (color intensity will be based on that)
stateName <- tolower(state.name) # state.name is part of built-in datasets pkg
stateName
## [1] "alabama" "alaska" "arizona" "arkansas"
## [5] "california" "colorado" "connecticut" "delaware"
## [9] "florida" "georgia" "hawaii" "idaho"
## [13] "illinois" "indiana" "iowa" "kansas"
## [17] "kentucky" "louisiana" "maine" "maryland"
## [21] "massachusetts" "michigan" "minnesota" "mississippi"
## [25] "missouri" "montana" "nebraska" "nevada"
## [29] "new hampshire" "new jersey" "new mexico" "new york"
## [33] "north carolina" "north dakota" "ohio" "oklahoma"
## [37] "oregon" "pennsylvania" "rhode island" "south carolina"
## [41] "south dakota" "tennessee" "texas" "utah"
## [45] "vermont" "virginia" "washington" "west virginia"
## [49] "wisconsin" "wyoming"
name_length <- nchar(stateName) # how many letters in the name
name_length
## [1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9
## [24] 11 8 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12 9 5 4 7 8
## [47] 10 13 9 7
values_df<- data.frame(stateName, name_length) #creating a 50x2 data frame here
head(values_df)
## stateName name_length
## 1 alabama 7
## 2 alaska 6
## 3 arizona 7
## 4 arkansas 8
## 5 california 10
## 6 colorado 8
#3. Create a choropleth
# We are going to plot the name_length values, on top of the base map
# The two are linked by the common column called stateName
mp <- ggplot(values_df, aes(map_id=stateName)) + geom_map(aes(fill=name_length), map=states_map_coords)
mp <- mp + expand_limits(x = states_map_coords$long, y = states_map_coords$lat)
# Let's a give it a color-gradient. Low to High
mp <- mp + scale_fill_gradient(low='grey90', high='darkgreen', limits=c(0,14))
mp
mp <- mp + guides(fill=FALSE)
#Optional step. If you want better-looking maps
library(mapproj)
mp <- mp + coord_map("polyconic")
mp
#and that's the choropleth we want
########################
# A.
#ggplot(mtcars, aes(x=factor(cyl), y=mpg, fill=cyl)) + geom_boxplot() + guides(fill=guide_legend(title="Cylinders"))
# B.
ggplot(mtcars, aes(x=cyl, y=mpg, fill=factor(cyl))) + geom_boxplot() + guides(fill=guide_legend(title="Cylinders"))
# C
ggplot(mtcars, aes(x=factor(cyl), y=mpg, fill=cyl)) + geom_boxplot() + scale_color_discrete()+ guides(fill=guide_legend(title="Cylinders"))
# D
ggplot(mtcars, aes(x=factor(cyl), y=mpg, fill=factor(cyl))) + geom_boxplot() + guides(fill=guide_legend(title="Cylinders"))
ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_boxplot()
# Added:
# A.
ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_boxplot() + scale_y_continuous(limits=c(0, 50))
# B
ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_boxplot() + scale_y_continuous(breaks=seq(0, 50, 10))
# C
ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_boxplot() + scale_y_continuous(breaks=0:50)
# D
ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_boxplot() + scale_y_continuous(breaks=c(0,50))
#Original
ggplot(diamonds, aes(x=color, fill=cut)) + geom_bar()
# A.
#ggplot(diamonds, aes(x=color, fill=cut)) + geom_bar(stat="identity")
# Doesn't show anything
# B
ggplot(diamonds, aes(x=color, fill=cut)) + geom_bar(position="dodge")
# C
ggplot(diamonds, aes(x=factor(color), fill=cut)) + geom_bar()
# D
# ggplot(diamonds, aes(x=color, fill=cut)) + geom_bar(color+cut ~ .)
# This one has error
# Question 8 of 10
#Data frame 1
crimes <-data.frame(state = tolower(rownames(USArrests)), USArrests)
#Data frame 2
states_map <-map_data("state")
head(states_map)
## long lat group order region subregion
## 1 -87.46 30.39 1 1 alabama <NA>
## 2 -87.48 30.37 1 2 alabama <NA>
## 3 -87.53 30.37 1 3 alabama <NA>
## 4 -87.53 30.33 1 4 alabama <NA>
## 5 -87.57 30.33 1 5 alabama <NA>
## 6 -87.59 30.33 1 6 alabama <NA>
#Now we can plot the choropleth:
ggplot(crimes, aes(map_id = state)) + geom_map(aes(fill = Murder), map = states_map) + expand_limits(x = states_map$long, y = states_map$lat)
# A.
# B
# C
# D
# Facet Grid question:
# The following graph was created using ggplot's facet_grid option. It used the mtcars dataset.
# The relevant variables are:
# mpg - miles per gallon
# am - automatic or manual: 0 or 1 in the dataset
# gear - 3, 4 or 5 gears by automobile
# cyl - 4, 6 or 8 cylinder engines.
base_g <- ggplot(mtcars, aes(x=am, y=mpg)) + geom_point()
base_g
base_g + facet_grid(gear~cyl)
base_g + facet_grid (am~cyl) # This one
base_g + facet_grid(am+gear ~ mpg+cyl)
base_g + facet_grid(am+cyl ~ gear)