library( dplyr )
library( ggplot2 )
library( gapminder )
library( openintro )
Exploring Categorical Data
Comics Dataset
# URL of the comics CSV file used in the categorical-data section.
comicsurl <- "https://assets.datacamp.com/production/repositories/537/datasets/8860af2c0ef67fc77a8c704a73bbb93a395debcf/comics.csv"
# Read the CSV directly from the URL into a data frame.
comics <- read.csv(file = comicsurl)
# Compact column-by-column preview of the data.
glimpse(comics)
## Rows: 23,272
## Columns: 11
## $ name <chr> "Spider-Man (Peter Parker)", "Captain America (Steven Ro…
## $ id <chr> "Secret", "Public", "Public", "Public", "No Dual", "Publ…
## $ align <chr> "Good", "Good", "Neutral", "Good", "Good", "Good", "Good…
## $ eye <chr> "Hazel Eyes", "Blue Eyes", "Blue Eyes", "Blue Eyes", "Bl…
## $ hair <chr> "Brown Hair", "White Hair", "Black Hair", "Black Hair", …
## $ gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", …
## $ gsm <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ alive <chr> "Living Characters", "Living Characters", "Living Charac…
## $ appearances <int> 4043, 3360, 3061, 2961, 2258, 2255, 2072, 2017, 1955, 19…
## $ first_appear <chr> "Aug-62", "Mar-41", "Oct-74", "Mar-63", "Nov-50", "Nov-6…
## $ publisher <chr> "marvel", "marvel", "marvel", "marvel", "marvel", "marve…
# Contingency table: counts for each combination of identity status (rows)
# and alignment (columns).
table(comics[["id"]], comics[["align"]])
##
## Bad Good Neutral Reformed Criminals
## No Dual 474 647 390 0
## Public 2172 2930 965 1
## Secret 4493 2475 959 1
## Unknown 7 0 2 0
Visualizing.
ggplot( data, aes( x=var1, fill=var2 ) ) + layer_name()
- the data (inferred if piped)
- the variables to be plotted
- layers built up in succession
# Stacked bar chart: one bar per identity status, segments filled by alignment.
comics %>%
  ggplot(aes(x = id, fill = align)) +
  geom_bar()
# Side-by-side bar chart of gender within each alignment
# (alignment on the x axis, one bar per gender).
comics %>%
  ggplot(aes(x = align, fill = gender)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30))

# Side-by-side bar chart of alignment within each gender
# (gender on the x axis, one bar per alignment).
comics %>%
  ggplot(aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30))
Counts vs. Proportions
# Global display options: a large `scipen` favours fixed over scientific
# notation, and `digits` sets how many significant digits are printed.
options(digits = 3, scipen = 999)

# Counts of identity status (rows) by alignment (columns).
tab_cnt <- table(comics[["id"]], comics[["align"]])

# Joint proportions: each cell divided by the grand total.
tab_prob <- prop.table(tab_cnt)
tab_prob
##
## Bad Good Neutral Reformed Criminals
## No Dual 0.0305491 0.0416989 0.0251353 0.0000000
## Public 0.1399845 0.1888373 0.0621939 0.0000644
## Secret 0.2895721 0.1595128 0.0618072 0.0000644
## Unknown 0.0004511 0.0000000 0.0001289 0.0000000
# Sanity check: the joint proportions over all cells add up to 1.
sum(tab_prob)
## [1] 1
Conditional Proportions
# Condition on the rows: each cell is divided by its row total,
# so every row adds to 1.
prop.table(tab_cnt, margin = 1)
##
## Bad Good Neutral Reformed Criminals
## No Dual 0.313700 0.428193 0.258107 0.000000
## Public 0.357943 0.482861 0.159031 0.000165
## Secret 0.566726 0.312185 0.120964 0.000126
## Unknown 0.777778 0.000000 0.222222 0.000000
# Row totals: counts summed across the columns for each row.
margin.table(tab_cnt, margin = 1)
##
## No Dual Public Secret Unknown
## 1511 6068 7928 9
# Column totals: counts summed across the rows for each column.
margin.table(tab_cnt, margin = 2)
##
## Bad Good Neutral Reformed Criminals
## 7146 6052 2316 2
# Condition on the columns: each cell is divided by its column total,
# so every column adds to 1.
prop.table(tab_cnt, margin = 2)
##
## Bad Good Neutral Reformed Criminals
## No Dual 0.066331 0.106907 0.168394 0.000000
## Public 0.303946 0.484137 0.416667 0.500000
## Secret 0.628743 0.408956 0.414076 0.500000
## Unknown 0.000980 0.000000 0.000864 0.000000
# Conditional bar charts: `position = "fill"` stretches every bar to the same
# height so the segments show proportions rather than counts.

# Conditioned on id: distribution of alignment within each identity status.
comics %>%
  ggplot(aes(x = id, fill = align)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

# Conditioned on alignment: distribution of identity status within each
# alignment.
comics %>%
  ggplot(aes(x = align, fill = id)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")
# Counts of alignment (rows) by gender (columns).
tab <- table(comics[["align"]], comics[["gender"]])
# Print fewer digits from here on.
options(digits = 2, scipen = 999)
# Joint proportions: each cell divided by the grand total.
prop.table(tab)
##
## Female Male Other
## Bad 0.082197 0.395099 0.001672
## Good 0.130114 0.251293 0.000888
## Neutral 0.043685 0.094006 0.000888
## Reformed Criminals 0.000052 0.000105 0.000000
# Proportions conditional on the columns (gender): every column adds to 1.
prop.table(tab, margin = 2)
##
## Female Male Other
## Bad 0.32102 0.53355 0.48485
## Good 0.50816 0.33936 0.25758
## Neutral 0.17061 0.12695 0.25758
## Reformed Criminals 0.00020 0.00014 0.00000
Distribution of one variable
Marginal Distribution
# Marginal distribution: counts of a single variable (identity status).
table(comics[["id"]])
##
## No Dual Public Secret Unknown
## 1788 6994 8698 9
Faceting: splits the dataset into subsets, one per level of a categorical variable, and draws the same plot for each subset in its own panel.
# Bar chart of identity status, drawn in a separate panel per alignment.
# The x-axis labels are rotated 45 degrees.
comics %>%
  ggplot(aes(x = id)) +
  geom_bar() +
  facet_wrap(~align) +
  theme(axis.text.x = element_text(angle = 45))
It’s like an expanded stacked bar chart: each segment of the stack gets its own panel.
# Keep only the three main alignments and turn `align` into a factor whose
# levels are ordered Bad < Neutral < Good, which fixes the bar order.
comics <- comics %>%
  filter(align %in% c("Bad", "Neutral", "Good")) %>%
  mutate(align = factor(align, levels = c("Bad", "Neutral", "Good")))

# Bar chart of alignment using the new level order.
comics %>%
  ggplot(aes(x = align)) +
  geom_bar()

# Keep only Female/Male rows and turn `gender` into a factor with that
# level order.
comics <- comics %>%
  filter(gender %in% c("Female", "Male")) %>%
  mutate(gender = factor(gender, levels = c("Female", "Male")))

# Bar chart of alignment, one panel per gender.
comics %>%
  ggplot(aes(x = align)) +
  geom_bar() +
  facet_wrap(~gender)
Exploring Numerical Data
# URL of the cars CSV file used in the numerical-data section.
carsurl <- "https://assets.datacamp.com/production/repositories/537/datasets/c0366d5da5ee8dce49919a5443685cf2e50c6a96/cars04.csv"
# Read the CSV directly from the URL into a data frame.
cars_df <- read.csv(file = carsurl)
# Show each column's type and first few values.
str(cars_df)
## 'data.frame': 428 obs. of 19 variables:
## $ name : chr "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
## $ sports_car : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ wagon : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ minivan : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pickup : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ all_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ rear_wheel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ dealer_cost: int 10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
## $ eng_size : num 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr : int 103 103 140 140 140 132 132 130 110 130 ...
## $ city_mpg : int 28 28 26 26 26 29 29 26 27 26 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
## $ weight : int 2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
## $ wheel_base : int 98 98 104 104 104 105 105 103 103 103 ...
## $ length : int 167 153 183 183 183 174 174 168 168 168 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
Dotplots
# Dot plot of car weight; `dotsize` shrinks the dots relative to the bin width.
cars_df %>%
  ggplot(aes(x = weight)) +
  geom_dotplot(dotsize = 0.3)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bindot).
Histogram
# Histogram of car weight using the default number of bins.
cars_df %>%
  ggplot(aes(x = weight)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Density plot
# Density plot of car weight.
cars_df %>%
  ggplot(aes(x = weight)) +
  geom_density()
## Warning: Removed 2 rows containing non-finite values (stat_density).
Boxplot
# Single box plot of car weight. `x = 1` is a dummy position for the one box;
# coord_flip() then swaps the axes so the box is drawn horizontally.
cars_df %>%
  ggplot(aes(x = 1, y = weight)) +
  geom_boxplot() +
  coord_flip()
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
Faceted Histogram
# Histogram of highway mileage, one panel per value of `pickup`.
cars_df %>%
  ggplot(aes(x = hwy_mpg)) +
  geom_histogram() +
  facet_wrap(~pickup)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).
# Keep only cars with 4, 6 or 8 cylinders.
common_cyl <- cars_df %>%
  filter(ncyl %in% c(4, 6, 8))

# Box plots of city mileage, one box per cylinder count. `ncyl` is converted
# to a factor so that it is treated as categorical on the x axis.
common_cyl %>%
  ggplot(aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Overlaid, semi-transparent density curves of city mileage for the same
# data, one curve per cylinder count.
common_cyl %>%
  ggplot(aes(x = city_mpg, fill = as.factor(ncyl))) +
  geom_density(alpha = 0.3)
## Warning: Removed 11 rows containing non-finite values (stat_density).
Distribution of one variable
building a data pipeline
# Histogram of highway mileage for cars with engines under 2.0,
# using bins that are 5 mpg wide.
ggplot(filter(cars_df, eng_size < 2), aes(x = hwy_mpg)) +
  geom_histogram(binwidth = 5)

# Density of the same subset, with a smoothing bandwidth of 5.
ggplot(filter(cars_df, eng_size < 2), aes(x = hwy_mpg)) +
  geom_density(bw = 5)

# Histogram of horsepower for all cars (default bins).
ggplot(cars_df, aes(x = horsepwr)) +
  geom_histogram() +
  labs(title = "hist of horsepwr")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of horsepower for cars with an MSRP below 25000. The x axis is
# limited to 90-550; values outside that range are dropped from the plot.
ggplot(filter(cars_df, msrp < 25000), aes(x = horsepwr)) +
  geom_histogram() +
  xlim(90, 550) +
  labs(title = "hist of horsepwr for affordable cars")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# The same horsepower histogram drawn with three different bin widths,
# to compare how the choice of bin width changes the picture.

# Bin width 3.
ggplot(cars_df, aes(x = horsepwr)) +
  geom_histogram(binwidth = 3) +
  labs(title = "horsepwr with binwidth of 3")

# Bin width 30.
ggplot(cars_df, aes(x = horsepwr)) +
  geom_histogram(binwidth = 30) +
  labs(title = "horsepwr with binwidth of 30")

# Bin width 60.
ggplot(cars_df, aes(x = horsepwr)) +
  geom_histogram(binwidth = 60) +
  labs(title = "horsepwr with binwidth of 60")
Box Plots
# Box plots of city mileage for the 4/6/8-cylinder cars, one box per
# cylinder count.
common_cyl %>%
  ggplot(aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Box plot of MSRP for all cars (`x = 1` is a dummy position for the one box).
ggplot(cars_df, aes(x = 1, y = msrp)) +
  geom_boxplot()

# Drop the high-priced cars: keep only rows with an MSRP below 100000.
cars_no_out <- filter(cars_df, msrp < 100000)

# Box plot of MSRP on the reduced data set.
ggplot(cars_no_out, aes(x = 1, y = msrp)) +
  geom_boxplot()
Visualization in higher dimensions
Facet Grid
# Density of MSRP in a grid of panels: rows are `pickup`, columns are
# `rear_wheel`. label_both prints "variable: value" in each panel strip.
cars_df %>%
  ggplot(aes(x = msrp)) +
  geom_density() +
  facet_grid(pickup ~ rear_wheel, labeller = label_both)

# Number of cars in each rear_wheel (rows) by pickup (columns) combination.
table(cars_df[["rear_wheel"]], cars_df[["pickup"]])
##
## FALSE TRUE
## FALSE 306 12
## TRUE 98 12
# Histograms of highway mileage in a grid of panels: rows are cylinder
# count, columns are `suv`.
ggplot(common_cyl, aes(x = hwy_mpg)) +
  geom_histogram() +
  facet_grid(ncyl ~ suv, labeller = label_both) +
  labs(title = "Some title")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
Measures of Center
# URL of the raw life-expectancy CSV file.
life_url <- "https://assets.datacamp.com/production/repositories/537/datasets/e079a96a639aa10afc478359da45f2f75f7efd2e/life_exp_raw.csv"
# Read the CSV directly from the URL into a data frame.
life <- read.csv(file = life_url)
# Compact column-by-column preview of the data.
glimpse(life)
## Rows: 81,691
## Columns: 10
## $ State <chr> "Alabama", "Alabama", "Alaba…
## $ County <chr> "Autauga County", "Baldwin C…
## $ fips <int> 1001, 1003, 1005, 1007, 1009…
## $ Year <int> 1985, 1985, 1985, 1985, 1985…
## $ Female.life.expectancy..years. <dbl> 77, 79, 76, 77, 79, 75, 77, …
## $ Female.life.expectancy..national..years. <dbl> 78, 78, 78, 78, 78, 78, 78, …
## $ Female.life.expectancy..state..years. <dbl> 77, 77, 77, 77, 77, 77, 77, …
## $ Male.life.expectancy..years. <dbl> 68, 71, 67, 67, 71, 67, 68, …
## $ Male.life.expectancy..national..years. <dbl> 71, 71, 71, 71, 71, 71, 71, …
## $ Male.life.expectancy..state..years. <dbl> 69, 69, 69, 69, 69, 69, 69, …
# List the column names of the life-expectancy data frame.
names(life)
## [1] "State"
## [2] "County"
## [3] "fips"
## [4] "Year"
## [5] "Female.life.expectancy..years."
## [6] "Female.life.expectancy..national..years."
## [7] "Female.life.expectancy..state..years."
## [8] "Male.life.expectancy..years."
## [9] "Male.life.expectancy..national..years."
## [10] "Male.life.expectancy..state..years."
Numerical Summaries
Median: robust. Mean: sensitive to extreme values
# Compare the mean and median life expectancy (female and male) between the
# West Coast states (California, Oregon, Washington) and all other states.
# NOTE(review): the result is assigned back to `life`, so the raw data read
# from the CSV is replaced by this summary table.
life <- life %>%
  group_by(west_coast = State %in% c("California", "Oregon", "Washington")) %>%
  summarise(
    mean(Female.life.expectancy..years.),
    mean(Male.life.expectancy..years.),
    median(Female.life.expectancy..years.),
    median(Male.life.expectancy..years.)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
life
## # A tibble: 2 x 5
## west_coast `mean(Female.li… `mean(Male.life… `median(Female.… `median(Male.li…
## <lgl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 78.7 72.6 78.8 72.8
## 2 TRUE 79.6 74.4 79.5 74.3
# Load the gapminder data set into the workspace and preview its columns.
data("gapminder")
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghani…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia,…
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,…
## $ lifeExp <dbl> 29, 30, 32, 34, 36, 38, 40, 41, 42, 42, 42, 44, 55, 59, 65,…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 1…
## $ gdpPercap <dbl> 779, 821, 853, 836, 740, 786, 978, 852, 649, 635, 727, 975,…
# Keep only the rows for the year 2007.
gap2007 <- gapminder %>%
  filter(year == 2007)

# Mean and median life expectancy for each continent.
gap2007 %>%
  group_by(continent) %>%
  summarise(
    mean(lifeExp),
    median(lifeExp)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fct> <dbl> <dbl>
## 1 Africa 54.8 52.9
## 2 Americas 73.6 72.9
## 3 Asia 70.7 72.4
## 4 Europe 77.6 78.6
## 5 Oceania 80.7 80.7
# Box plots of 2007 life expectancy, one box per continent.
ggplot(gap2007, aes(x = continent, y = lifeExp)) +
  geom_boxplot()
Visualization in higher dimensions
# Density of MSRP in a grid of panels.
# facet_grid(rows ~ columns): `pickup` defines the rows and `rear_wheel`
# the columns.
cars_df %>%
  ggplot(aes(x = msrp)) +
  geom_density() +
  facet_grid(pickup ~ rear_wheel)
Measures of Variability
Variance: take the sum of the squared distances of each measurement in the group from the group mean, then divide that sum by the number of measurements minus 1. Standard deviation: the square root of the variance.
Standard deviation is commonly used because it is expressed in the same units as the data.
# Standard deviation of life expectancy across all gapminder rows.
sd(gapminder[["lifeExp"]])
## [1] 13
# Variance of the same variable.
var(gapminder[["lifeExp"]])
## [1] 167
# Summary statistics: minimum, quartiles, median, mean and maximum.
summary(gapminder[["lifeExp"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24 48 61 59 71 83
# Interquartile range: third quartile minus first quartile.
IQR(gapminder[["lifeExp"]])
## [1] 23
IQR is a more robust measure of data spread than range; less impacted by outliers.
# Standard deviation, interquartile range and number of countries for
# 2007 life expectancy in each continent.
gap2007 %>%
  group_by(continent) %>%
  summarise(
    sd(lifeExp),
    IQR(lifeExp),
    n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fct> <dbl> <dbl> <int>
## 1 Africa 9.63 11.6 52
## 2 Americas 4.44 4.63 25
## 3 Asia 7.96 10.2 33
## 4 Europe 2.98 4.78 30
## 5 Oceania 0.729 0.516 2
# Overlaid, semi-transparent density curves of 2007 life expectancy,
# one per continent.
ggplot(gap2007, aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.3)

# Based on the shape of each distribution, choose the most appropriate
# measures of centre and spread.

# Density of life expectancy in the Americas only.
ggplot(filter(gap2007, continent == "Americas"), aes(x = lifeExp)) +
  geom_density()

# Mean and standard deviation of life expectancy in the Americas.
gap2007 %>%
  filter(continent == "Americas") %>%
  summarise(
    mean(lifeExp),
    sd(lifeExp)
  )
## # A tibble: 1 x 2
## `mean(lifeExp)` `sd(lifeExp)`
## <dbl> <dbl>
## 1 73.6 4.44
# Density of 2007 population.
gap2007 %>%
  ggplot(aes(x = pop)) +
  geom_density()

# Median and interquartile range of population.
gap2007 %>%
  summarise(
    median(pop),
    IQR(pop)
  )
## # A tibble: 1 x 2
## `median(pop)` `IQR(pop)`
## <dbl> <dbl>
## 1 10517531 26702008.
# Density of the original population variable.
ggplot(gap2007, aes(x = pop)) +
  geom_density()

# Add the natural log of population as a new column to reduce the skew.
gap2007$log_pop <- log(gap2007$pop)

# Density of the log-transformed variable.
ggplot(gap2007, aes(x = log_pop)) +
  geom_density()
Outliers
# Asian countries in 2007, with a flag marking any country whose life
# expectancy is below 50 as an outlier.
gap_asia <- mutate(
  filter(gap2007, continent == "Asia"),
  is_outlier = lifeExp < 50
)

# Box plot of life expectancy with the flagged outliers removed.
ggplot(filter(gap_asia, !is_outlier), aes(x = 1, y = lifeExp)) +
  geom_boxplot()
Case Study
Email Data: What characteristics of an email are associated with it being spam?
glimpse( email )
## Rows: 3,921
## Columns: 21
## $ spam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ cc <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1, 0, 2, 0,…
## $ sent_email <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,…
## $ time <dttm> 2012-01-01 01:16:41, 2012-01-01 02:03:59, 2012-01-01 11…
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ dollar <dbl> 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 5, 0, 0,…
## $ winner <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
## $ inherit <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ password <dbl> 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ num_char <dbl> 11.37, 10.50, 7.77, 13.26, 1.23, 1.09, 4.84, 7.42, 3.05,…
## $ line_breaks <int> 202, 202, 192, 255, 29, 25, 193, 237, 69, 68, 25, 79, 19…
## $ format <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,…
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,…
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ exclaim_mess <dbl> 0, 1, 6, 48, 1, 1, 1, 18, 1, 0, 2, 1, 0, 10, 4, 10, 20, …
## $ number <fct> big, small, small, small, none, none, big, small, small,…
# Median and interquartile range of `num_char` for each value of `spam`.
email %>%
  group_by(spam) %>%
  summarise(
    median(num_char),
    IQR(num_char)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## spam `median(num_char)` `IQR(num_char)`
## <dbl> <dbl> <dbl>
## 1 0 6.83 13.6
## 2 1 1.05 2.82
# Box plots of log(num_char), one box per value of `spam`.
# `spam` is mapped to x as a factor (instead of only to `group`) so each box
# gets its own labelled position on the x axis and the two classes can be
# told apart.
email %>%
  mutate(log_num_char = log(num_char)) %>%
  ggplot(aes(x = as.factor(spam), y = log_num_char)) +
  geom_boxplot()
The median length of not-spam emails is greater than that of spam emails
# Median and interquartile range of `exclaim_mess` for each value of `spam`.
email %>%
  group_by(spam) %>%
  summarise(
    median(exclaim_mess),
    IQR(exclaim_mess)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## spam `median(exclaim_mess)` `IQR(exclaim_mess)`
## <dbl> <dbl> <dbl>
## 1 0 1 5
## 2 1 0 1
# Overlaid density curves of log(exclaim_mess + 0.01), one per value of
# `spam`. The 0.01 offset keeps the log finite when exclaim_mess is zero.
# `spam` is mapped to `fill` as a factor so the two curves get different
# colours and a legend; without a fill mapping the `alpha` transparency has
# nothing to act on and the classes cannot be told apart.
email %>%
  mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
  ggplot(aes(x = log_exclaim_mess, fill = as.factor(spam))) +
  geom_density(alpha = 0.3) +
  xlim(-10, 10)
The most common value of exclaim_mess in both classes of email is zero (a log(exclaim_mess) of -4.6 after adding .01).
Even after a transformation, the distribution of exclaim_mess in both classes of email is right-skewed.
The typical number of exclamations in the not-spam group appears to be slightly higher than in the spam group
# Counts of emails with zero vs. non-zero exclamation marks in the message,
# one panel per value of `spam`.
ggplot(
  mutate(email, zero = exclaim_mess == 0),
  aes(x = zero, color = as.factor(spam), fill = as.factor(spam))
) +
  geom_bar() +
  facet_wrap(~spam)

# The same split drawn as proportions: each bar is stretched to height 1 and
# filled by `spam`.
ggplot(
  mutate(email, zero = exclaim_mess == 0),
  aes(x = zero, fill = as.factor(spam))
) +
  geom_bar(position = "fill")

# Distribution of the `image` count across all emails.
table(email[["image"]])
##
## 0 1 2 3 4 5 9 20
## 3811 76 17 11 2 2 1 1
# Counts of emails with and without at least one image, one panel per value
# of `spam`.
ggplot(
  mutate(email, has_image = image > 0),
  aes(x = has_image, color = as.factor(spam), fill = as.factor(spam))
) +
  geom_bar() +
  facet_wrap(~spam)

# The same split drawn as proportions within each `has_image` group.
ggplot(
  mutate(email, has_image = image > 0),
  aes(x = has_image, color = as.factor(spam), fill = as.factor(spam))
) +
  geom_bar(position = "fill")

# Question 1: For emails containing the word "dollar", does the typical spam
# email contain a greater number of occurrences of the word than the typical
# non-spam email? Answered with the median of `dollar` per value of `spam`,
# restricted to emails where `dollar` is positive.
email %>%
  filter(dollar > 0) %>%
  group_by(spam) %>%
  summarise(median(dollar))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## spam `median(dollar)`
## <dbl> <dbl>
## 1 0 4
## 2 1 2
# Question 2: If you encounter an email with greater than 10 occurrences of
# the word "dollar", is it more likely to be spam or not-spam? Answered with
# a bar chart of the number of such emails per value of `spam`.
# `spam` is converted to a factor so the x axis is categorical, as in the
# other spam plots in this file, rather than a continuous numeric scale.
email %>%
  filter(dollar > 10) %>%
  ggplot(aes(x = as.factor(spam))) +
  geom_bar()
Custom Ordering bars
# Flag emails with no exclamation marks in the message. The logical flag is
# stored as a factor with explicit levels so that the TRUE bar is drawn
# before the FALSE bar.
email <- email %>%
  mutate(zero = factor(exclaim_mess == 0, levels = c("TRUE", "FALSE")))

# Bar chart of the flag, one panel per value of `spam`.
ggplot(email, aes(x = zero)) +
  geom_bar() +
  facet_wrap(~spam)

# Copy `number` into a new factor whose levels are ordered none, small, big.
email <- email %>%
  mutate(number_reordered = factor(number, levels = c("none", "small", "big")))

# Bar chart of the reordered factor, one panel per value of `spam`.
email %>%
  ggplot(aes(x = number_reordered)) +
  geom_bar() +
  facet_wrap(~spam)