Exploring Categorical Data
> library(tidyverse)
> library(openintro)
> comics <- read.csv("C:\\Users\\pbj20\\Documents\\R Documents\\comics.csv")
> comics <- as_tibble(comics)
> head(comics)
# A tibble: 6 x 11
name id align eye hair gender gsm alive appearances first_appear
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
1 "Spi~ Secr~ Good Haze~ Brow~ Male <NA> Livi~ 4043 Aug-62
2 "Cap~ Publ~ Good Blue~ Whit~ Male <NA> Livi~ 3360 Mar-41
3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male <NA> Livi~ 3061 Oct-74
4 "Iro~ Publ~ Good Blue~ Blac~ Male <NA> Livi~ 2961 Mar-63
5 "Tho~ No D~ Good Blue~ Blon~ Male <NA> Livi~ 2258 Nov-50
6 "Ben~ Publ~ Good Blue~ No H~ Male <NA> Livi~ 2255 Nov-61
# ... with 1 more variable: publisher <chr>
> glimpse(comics)
Rows: 23,272
Columns: 11
$ name <chr> "Spider-Man (Peter Parker)", "Captain America (Steven ...
$ id <chr> "Secret", "Public", "Public", "Public", "No Dual", "Pu...
$ align <chr> "Good", "Good", "Neutral", "Good", "Good", "Good", "Go...
$ eye <chr> "Hazel Eyes", "Blue Eyes", "Blue Eyes", "Blue Eyes", "...
$ hair <chr> "Brown Hair", "White Hair", "Black Hair", "Black Hair"...
$ gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male"...
$ gsm <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ alive <chr> "Living Characters", "Living Characters", "Living Char...
$ appearances <int> 4043, 3360, 3061, 2961, 2258, 2255, 2072, 2017, 1955, ...
$ first_appear <chr> "Aug-62", "Mar-41", "Oct-74", "Mar-63", "Nov-50", "Nov...
$ publisher <chr> "marvel", "marvel", "marvel", "marvel", "marvel", "mar...
> table(comics$id,comics$align)
Bad Good Neutral Reformed Criminals
No Dual 474 647 390 0
Public 2172 2930 965 1
Secret 4493 2475 959 1
Unknown 7 0 2 0
> comics$align <- as.factor(comics$align)
> levels(comics$align)
[1] "Bad" "Good" "Neutral"
[4] "Reformed Criminals"
> # Convert the remaining categorical text columns to factors in one step
> # (align was already converted above; gsm is left as character).
> comics <- comics %>%
+ mutate(across(c(name, id, eye, hair, gender, alive, first_appear, publisher),
+ as.factor))
>
> levels(comics$id)
[1] "No Dual" "Public" "Secret" "Unknown"
> ggplot(comics,aes(x=id,fill=align))+geom_bar()

> # Print the first rows of the data
> head(comics)
# A tibble: 6 x 11
name id align eye hair gender gsm alive appearances first_appear
<fct> <fct> <fct> <fct> <fct> <fct> <chr> <fct> <int> <fct>
1 "Spi~ Secr~ Good Haze~ Brow~ Male <NA> Livi~ 4043 Aug-62
2 "Cap~ Publ~ Good Blue~ Whit~ Male <NA> Livi~ 3360 Mar-41
3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male <NA> Livi~ 3061 Oct-74
4 "Iro~ Publ~ Good Blue~ Blac~ Male <NA> Livi~ 2961 Mar-63
5 "Tho~ No D~ Good Blue~ Blon~ Male <NA> Livi~ 2258 Nov-50
6 "Ben~ Publ~ Good Blue~ No H~ Male <NA> Livi~ 2255 Nov-61
# ... with 1 more variable: publisher <fct>
> # Check levels of align
> levels(comics$align)
[1] "Bad" "Good" "Neutral"
[4] "Reformed Criminals"
> # Check the levels of gender
> levels(comics$gender)
[1] "Female" "Male" "Other"
> # Create a 2-way contingency table
> tab <- table(comics$align,comics$gender)
>
> # Print tab
> print(tab)
Female Male Other
Bad 1573 7561 32
Good 2490 4809 17
Neutral 836 1799 17
Reformed Criminals 1 2 0
> # Remove the "Reformed Criminals" align level
> # (note: filter() also drops rows where align is NA)
> comics_filtered <- comics %>%
+ filter(align != "Reformed Criminals") %>%
+ droplevels()
> # See the result
> comics_filtered
# A tibble: 19,856 x 11
name id align eye hair gender gsm alive appearances first_appear
<fct> <fct> <fct> <fct> <fct> <fct> <chr> <fct> <int> <fct>
1 "Spi~ Secr~ Good Haze~ Brow~ Male <NA> Livi~ 4043 Aug-62
2 "Cap~ Publ~ Good Blue~ Whit~ Male <NA> Livi~ 3360 Mar-41
3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male <NA> Livi~ 3061 Oct-74
4 "Iro~ Publ~ Good Blue~ Blac~ Male <NA> Livi~ 2961 Mar-63
5 "Tho~ No D~ Good Blue~ Blon~ Male <NA> Livi~ 2258 Nov-50
6 "Ben~ Publ~ Good Blue~ No H~ Male <NA> Livi~ 2255 Nov-61
7 "Ree~ Publ~ Good Brow~ Brow~ Male <NA> Livi~ 2072 Nov-61
8 "Hul~ Publ~ Good Brow~ Brow~ Male <NA> Livi~ 2017 May-62
9 "Sco~ Publ~ Neut~ Brow~ Brow~ Male <NA> Livi~ 1955 Sep-63
10 "Jon~ Publ~ Good Blue~ Blon~ Male <NA> Livi~ 1934 Nov-61
# ... with 19,846 more rows, and 1 more variable: publisher <fct>
> # Create side-by-side barchart of gender by alignment
> ggplot(comics_filtered, aes(x = align, fill = gender)) +
+ geom_bar(position = "dodge")

> # Create side-by-side barchart of alignment by gender
> ggplot(comics_filtered, aes(x = gender, fill = align)) +
+ geom_bar(position = "dodge") +
+ theme(axis.text.x = element_text(angle = 90))

> options(scipen = 999, digits =3) #simplify display format
> tab_cnt <- table(comics_filtered$id, comics_filtered$align)
> tab_cnt
Bad Good Neutral
No Dual 474 647 390
Public 2172 2930 965
Secret 4493 2475 959
Unknown 7 0 2
> prop.table(tab_cnt)
Bad Good Neutral
No Dual 0.030553 0.041704 0.025139
Public 0.140003 0.188862 0.062202
Secret 0.289609 0.159533 0.061815
Unknown 0.000451 0.000000 0.000129
> sum(prop.table(tab_cnt))
[1] 1
> # each row = 100%
> prop.table(tab_cnt,1)
Bad Good Neutral
No Dual 0.314 0.428 0.258
Public 0.358 0.483 0.159
Secret 0.567 0.312 0.121
Unknown 0.778 0.000 0.222
> # each column = 100%
> prop.table(tab_cnt,2)
Bad Good Neutral
No Dual 0.066331 0.106907 0.168394
Public 0.303946 0.484137 0.416667
Secret 0.628743 0.408956 0.414076
Unknown 0.000980 0.000000 0.000864
> ggplot(comics_filtered, aes(x = id, fill=align))+
+ geom_bar(position = "fill") +
+ ylab("proportion")

> ggplot(comics_filtered, aes(x = align, fill=id))+
+ geom_bar(position = "fill") +
+ ylab("proportion")

> tab <- table(comics_filtered$align, comics_filtered$gender)
> options(scipen = 999, digits = 3) # Print fewer digits
> prop.table(tab) # Joint proportions
Female Male Other
Bad 0.082210 0.395160 0.001672
Good 0.130135 0.251333 0.000888
Neutral 0.043692 0.094021 0.000888
> prop.table(tab, 2) # Conditional on columns
Female Male Other
Bad 0.321 0.534 0.485
Good 0.508 0.339 0.258
Neutral 0.171 0.127 0.258
> # Plot of gender by align
> ggplot(comics_filtered, aes(x = align, fill = gender)) +
+ geom_bar()

> # Plot proportion of gender, conditional on align
> ggplot(comics_filtered, aes(x = align, fill = gender)) +
+ geom_bar(position = "fill") +
+ ylab("proportion")

> table(comics$id)
No Dual Public Secret Unknown
1788 6994 8698 9
> ggplot(comics_filtered, aes(x=id)) + geom_bar() +
+ facet_wrap(~align) +
+ theme(axis.text.x = element_text(angle = 90))

> # Change the order of the levels in align
> comics_filtered$align <- factor(comics_filtered$align,
+ levels = c("Bad", "Neutral", "Good"))
>
> # Create plot of align
> ggplot(comics_filtered, aes(x = align)) +
+ geom_bar()

> # Plot of alignment broken down by gender
> ggplot(comics_filtered, aes(x = align)) +
+ geom_bar() +
+ facet_wrap(~ gender)

Exploring Numerical Data
> cars <- read_csv("C:\\Users\\pbj20\\Documents\\R Documents\\cars.csv")
>
> glimpse(cars)
Rows: 428
Columns: 19
$ name <chr> "Chevrolet Aveo 4dr", "Chevrolet Aveo LS 4dr hatch", "C...
$ sports_car <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ suv <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ wagon <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ minivan <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ pickup <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ all_wheel <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ rear_wheel <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ msrp <dbl> 11690, 12585, 14610, 14810, 16385, 13670, 15040, 13270,...
$ dealer_cost <dbl> 10965, 11802, 13697, 13884, 15357, 12849, 14086, 12482,...
$ eng_size <dbl> 1.6, 1.6, 2.2, 2.2, 2.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...
$ ncyl <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
$ horsepwr <dbl> 103, 103, 140, 140, 140, 132, 132, 130, 110, 130, 130, ...
$ city_mpg <dbl> 28, 28, 26, 26, 26, 29, 29, 26, 27, 26, 26, 32, 36, 32,...
$ hwy_mpg <dbl> 34, 34, 37, 37, 37, 36, 36, 33, 36, 33, 33, 38, 44, 38,...
$ weight <dbl> 2370, 2348, 2617, 2676, 2617, 2581, 2626, 2612, 2606, 2...
$ wheel_base <dbl> 98, 98, 104, 104, 104, 105, 105, 103, 103, 103, 103, 10...
$ length <dbl> 167, 153, 183, 183, 183, 174, 174, 168, 168, 168, 168, ...
$ width <dbl> 66, 66, 69, 68, 69, 67, 67, 67, 67, 67, 67, 67, 67, 68,...
> # Learn data structure
> str(cars)
tibble [428 x 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ name : chr [1:428] "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
$ sports_car : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ suv : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ wagon : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ minivan : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ pickup : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ all_wheel : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ rear_wheel : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ msrp : num [1:428] 11690 12585 14610 14810 16385 ...
$ dealer_cost: num [1:428] 10965 11802 13697 13884 15357 ...
$ eng_size : num [1:428] 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
$ ncyl : num [1:428] 4 4 4 4 4 4 4 4 4 4 ...
$ horsepwr : num [1:428] 103 103 140 140 140 132 132 130 110 130 ...
$ city_mpg : num [1:428] 28 28 26 26 26 29 29 26 27 26 ...
$ hwy_mpg : num [1:428] 34 34 37 37 37 36 36 33 36 33 ...
$ weight : num [1:428] 2370 2348 2617 2676 2617 ...
$ wheel_base : num [1:428] 98 98 104 104 104 105 105 103 103 103 ...
$ length : num [1:428] 167 153 183 183 183 174 174 168 168 168 ...
$ width : num [1:428] 66 66 69 68 69 67 67 67 67 67 ...
- attr(*, "spec")=
.. cols(
.. name = col_character(),
.. sports_car = col_logical(),
.. suv = col_logical(),
.. wagon = col_logical(),
.. minivan = col_logical(),
.. pickup = col_logical(),
.. all_wheel = col_logical(),
.. rear_wheel = col_logical(),
.. msrp = col_double(),
.. dealer_cost = col_double(),
.. eng_size = col_double(),
.. ncyl = col_double(),
.. horsepwr = col_double(),
.. city_mpg = col_double(),
.. hwy_mpg = col_double(),
.. weight = col_double(),
.. wheel_base = col_double(),
.. length = col_double(),
.. width = col_double()
.. )
> # Create faceted histogram
> ggplot(cars, aes(x = city_mpg)) +
+ geom_histogram() +
+ facet_wrap(~ suv)

> unique(cars$ncyl)
[1] 4 6 3 8 5 12 10 -1
> # Filter cars with 4, 6, 8 cylinders
> common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
>
> # Create box plots of city mpg by ncyl
> ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
+ geom_boxplot()

> # Create overlaid density plots for same data
> ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
+ geom_density(alpha = .3)

> cars %>% filter(eng_size < 2.0) %>%
+ ggplot(aes(x=hwy_mpg)) +
+ geom_histogram()

> cars %>% filter(eng_size < 2.0) %>%
+ ggplot(aes(x=hwy_mpg)) +
+ geom_histogram(binwidth = 5)

> cars %>% filter(eng_size < 2.0) %>%
+ ggplot(aes(x=hwy_mpg)) +
+ geom_density(bw = 5)

> # Create hist of horsepwr
> cars %>%
+ ggplot(aes(x=horsepwr)) +
+ geom_histogram() +
+ ggtitle("Hist of Horsepwr")

> # Create hist of horsepwr for affordable cars
> cars %>%
+ filter(msrp<25000) %>%
+ ggplot(aes(x=horsepwr)) +
+ geom_histogram() +
+ xlim(c(90, 550)) +
+ ggtitle("Hist of Horsepwr")

> # Create hist of horsepwr with binwidth of 3
> cars %>%
+ ggplot(aes(x=horsepwr)) +
+ geom_histogram(binwidth = 3) +
+ ggtitle("Hist bin 3")

> # Create hist of horsepwr with binwidth of 30
> cars %>%
+ ggplot(aes(x=horsepwr)) +
+ geom_histogram(binwidth = 30) +
+ ggtitle("Hist bin 30")

> # Create hist of horsepwr with binwidth of 60
> cars %>%
+ ggplot(aes(x=horsepwr)) +
+ geom_histogram(binwidth = 60) +
+ ggtitle("Hist bin 60")

> # Use x = 1 to draw a box plot of a single variable
>
> # Construct box plot of msrp
> cars %>%
+ ggplot(aes(x = 1, y = msrp)) +
+ geom_boxplot()

> # Exclude outliers from data
> cars_no_out <- cars %>%
+ filter(msrp<100000)
>
> # Construct box plot of msrp using the reduced dataset
> cars_no_out %>%
+ ggplot(aes(x = 1, y = msrp)) +
+ geom_boxplot()

> # Create plot of city_mpg
> cars %>%
+ ggplot(aes(x=1, y=city_mpg)) +
+ geom_boxplot()

> cars %>%
+ ggplot(aes(x=city_mpg)) +
+ geom_density()

> # Create plot of width
> cars %>%
+ ggplot(aes(x=1, y=width)) +
+ geom_boxplot()

> cars %>%
+ ggplot(aes(x=width)) +
+ geom_density()

> ggplot(cars, aes(x=msrp))+
+ geom_density()+
+ facet_grid(pickup ~ rear_wheel)

> #pickup rows, rear_wheel columns
>
> ggplot(cars, aes(x=msrp))+
+ geom_density()+
+ facet_grid(pickup ~ rear_wheel, labeller = label_both)

> table(wheel = cars$rear_wheel, pickup = cars$pickup)
pickup
wheel FALSE TRUE
FALSE 306 12
TRUE 98 12
> # Facet hists using hwy mileage and ncyl
> common_cyl %>%
+ ggplot(aes(x = hwy_mpg)) +
+ geom_histogram() +
+ facet_grid(ncyl ~ suv) +
+ ggtitle("By ncyl and suv")

Case Study
> head(email)
# A tibble: 6 x 21
spam to_multiple from cc sent_email time image attach
<dbl> <dbl> <dbl> <int> <dbl> <dttm> <dbl> <dbl>
1 0 0 1 0 0 2012-01-01 01:16:41 0 0
2 0 0 1 0 0 2012-01-01 02:03:59 0 0
3 0 0 1 0 0 2012-01-01 11:00:32 0 0
4 0 0 1 0 0 2012-01-01 04:09:49 0 0
5 0 0 1 0 0 2012-01-01 05:00:01 0 0
6 0 0 1 0 0 2012-01-01 05:04:46 0 0
# ... with 13 more variables: dollar <dbl>, winner <fct>, inherit <dbl>,
# viagra <dbl>, password <dbl>, num_char <dbl>, line_breaks <int>,
# format <dbl>, re_subj <dbl>, exclaim_subj <dbl>, urgent_subj <dbl>,
# exclaim_mess <dbl>, number <fct>
> glimpse(email)
Rows: 3,921
Columns: 21
$ spam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ cc <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1, 0, 2, ...
$ sent_email <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...
$ time <dttm> 2012-01-01 01:16:41, 2012-01-01 02:03:59, 2012-01-01 ...
$ image <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ attach <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ dollar <dbl> 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 5, 0, ...
$ winner <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no...
$ inherit <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ password <dbl> 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
$ num_char <dbl> 11.370, 10.504, 7.773, 13.256, 1.231, 1.091, 4.837, 7....
$ line_breaks <int> 202, 202, 192, 255, 29, 25, 193, 237, 69, 68, 25, 79, ...
$ format <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
$ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, ...
$ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
$ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ exclaim_mess <dbl> 0, 1, 6, 48, 1, 1, 1, 18, 1, 0, 2, 1, 0, 10, 4, 10, 20...
$ number <fct> big, small, small, small, none, none, big, small, smal...
> # Compute summary statistics
> email %>%
+ group_by(spam) %>%
+ summarize(median(num_char),
+ IQR(num_char))
# A tibble: 2 x 3
spam `median(num_char)` `IQR(num_char)`
<dbl> <dbl> <dbl>
1 0 6.83 13.6
2 1 1.05 2.82
> # Create plot: side-by-side box plots of log(num_char), one box per spam class.
> # spam is stored as a number (<dbl>), so it is wrapped in factor(); with a
> # continuous x, geom_boxplot() would pool every email into a single box.
> email %>%
+ mutate(log_num_char = log(num_char)) %>%
+ ggplot(aes(x = factor(spam), y = log_num_char)) +
+ geom_boxplot() +
+ xlab("spam")

> # Compute center and spread for exclaim_mess by spam
> email %>%
+ group_by(spam) %>%
+ summarize(median(exclaim_mess),
+ IQR(exclaim_mess))
# A tibble: 2 x 3
spam `median(exclaim_mess)` `IQR(exclaim_mess)`
<dbl> <dbl> <dbl>
1 0 1 5
2 1 0 1
> # Create plot for spam and exclaim_mess
> email %>%
+ mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
+ ggplot(aes(x = log_exclaim_mess)) +
+ geom_histogram() +
+ facet_wrap(~ spam)

> # Alternative plot: side-by-side box plots
> email %>%
+ mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
+ ggplot(aes(x = 1, y = log_exclaim_mess)) +
+ geom_boxplot() +
+ facet_wrap(~ spam)

> # Alternative plot: Overlaid density plots, one curve per spam class.
> # spam is numeric (<dbl>); factor() makes the fill discrete so that it
> # defines the groups instead of yielding a single pooled density.
> email %>%
+ mutate(log_exclaim_mess = log(exclaim_mess + .01)) %>%
+ ggplot(aes(x = log_exclaim_mess, fill = factor(spam))) +
+ geom_density(alpha = 0.3) +
+ labs(fill = "spam")

> table(email$image)
0 1 2 3 4 5 9 20
3811 76 17 11 2 2 1 1
> # Create plot of proportion of spam by image.
> # has_image flags emails with at least one image. spam is numeric (<dbl>),
> # so factor() is needed for the fill to split each bar by spam class.
> email %>%
+ mutate(has_image = image > 0 ) %>%
+ ggplot(aes(x = has_image, fill = factor(spam))) +
+ geom_bar(position = "fill") +
+ labs(fill = "spam", y = "proportion")

> sum(email$num_char < 0 )
[1] 0
> # Test if images count as attachments
> sum(email$image > email$attach)
[1] 0
> # Question 1
> email %>%
+ filter(dollar > 0) %>%
+ group_by(spam) %>%
+ summarize(n())
# A tibble: 2 x 2
spam `n()`
<dbl> <int>
1 0 668
2 1 78
> # Question 2
> email %>%
+ filter(dollar >10) %>%
+ ggplot(aes(x = spam)) +
+ geom_bar()

> # Reorder levels
> email$number_reordered <- factor(email$number, levels = c("none", "small", "big"))
>
> # Construct plot of number_reordered
> ggplot(email, aes(x = number_reordered)) +
+ geom_bar() +
+ facet_wrap(~ spam)
