Exploratory Data Analysis in R

DataCamp: Statistics with R

Bonnie Cooper

library( dplyr )
library( ggplot2 )
library( gapminder )
library( openintro )

Exploring Categorical Data

Comics Dataset

comicsurl <- 'https://assets.datacamp.com/production/repositories/537/datasets/8860af2c0ef67fc77a8c704a73bbb93a395debcf/comics.csv'
comics <- read.csv( comicsurl )
glimpse( comics )
## Rows: 23,272
## Columns: 11
## $ name         <chr> "Spider-Man (Peter Parker)", "Captain America (Steven Ro…
## $ id           <chr> "Secret", "Public", "Public", "Public", "No Dual", "Publ…
## $ align        <chr> "Good", "Good", "Neutral", "Good", "Good", "Good", "Good…
## $ eye          <chr> "Hazel Eyes", "Blue Eyes", "Blue Eyes", "Blue Eyes", "Bl…
## $ hair         <chr> "Brown Hair", "White Hair", "Black Hair", "Black Hair", …
## $ gender       <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", …
## $ gsm          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ alive        <chr> "Living Characters", "Living Characters", "Living Charac…
## $ appearances  <int> 4043, 3360, 3061, 2961, 2258, 2255, 2072, 2017, 1955, 19…
## $ first_appear <chr> "Aug-62", "Mar-41", "Oct-74", "Mar-63", "Nov-50", "Nov-6…
## $ publisher    <chr> "marvel", "marvel", "marvel", "marvel", "marvel", "marve…
#contingency table
table( comics$id, comics$align )
##          
##            Bad Good Neutral Reformed Criminals
##   No Dual  474  647     390                  0
##   Public  2172 2930     965                  1
##   Secret  4493 2475     959                  1
##   Unknown    7    0       2                  0

Visualizing.
ggplot( data, aes( x=var1, fill=var2 ) ) + layer_name()

  1. the data (inferred if piped)
  2. the variables to be plotted
  3. layers built up in succession
ggplot( comics, aes( x=id, fill=align ) ) + geom_bar()

# Side-by-side bar chart of gender within each alignment:
# alignment on the x-axis, one dodged bar per gender.
comics %>%
  ggplot(aes(x = align, fill = gender)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30))

# Side-by-side bar chart of alignment within each gender:
# gender on the x-axis, one dodged bar per alignment.
comics %>%
  ggplot(aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 30))

Counts vs. Proportions

options( scipen = 999, digits = 3 )
# options() sets global options that affect how R computes and displays its results: a large scipen biases printing toward fixed (non-scientific) notation, and digits controls how many significant digits are printed.

tab_cnt <- table( comics$id, comics$align )
tab_prob <- prop.table( tab_cnt )
( tab_prob )
##          
##                 Bad      Good   Neutral Reformed Criminals
##   No Dual 0.0305491 0.0416989 0.0251353          0.0000000
##   Public  0.1399845 0.1888373 0.0621939          0.0000644
##   Secret  0.2895721 0.1595128 0.0618072          0.0000644
##   Unknown 0.0004511 0.0000000 0.0001289          0.0000000
sum( tab_prob )
## [1] 1

Conditional Proportions

prop.table( tab_cnt, 1 ) #Condition on the rows (every row adds to 1)
##          
##                Bad     Good  Neutral Reformed Criminals
##   No Dual 0.313700 0.428193 0.258107           0.000000
##   Public  0.357943 0.482861 0.159031           0.000165
##   Secret  0.566726 0.312185 0.120964           0.000126
##   Unknown 0.777778 0.000000 0.222222           0.000000
margin.table( tab_cnt, 1 ) #sum on the rows
## 
## No Dual  Public  Secret Unknown 
##    1511    6068    7928       9
margin.table( tab_cnt, 2) #sum on the columns
## 
##                Bad               Good            Neutral Reformed Criminals 
##               7146               6052               2316                  2
prop.table( tab_cnt, 2 ) #Condition on the columns (every column adds to 1)
##          
##                Bad     Good  Neutral Reformed Criminals
##   No Dual 0.066331 0.106907 0.168394           0.000000
##   Public  0.303946 0.484137 0.416667           0.500000
##   Secret  0.628743 0.408956 0.414076           0.500000
##   Unknown 0.000980 0.000000 0.000864           0.000000
# Conditional bar charts: position = "fill" stretches every bar to the same
# height, so the coloured segments show proportions within each x category.

# Conditioned on id: share of each alignment within every id category.
comics %>%
  ggplot(aes(x = id, fill = align)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

# Conditioned on alignment: share of each id category within every alignment.
comics %>%
  ggplot(aes(x = align, fill = id)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

tab <- table(comics$align, comics$gender)
options(scipen = 999, digits = 2) # Print fewer digits
prop.table(tab)     # Joint proportions
##                     
##                        Female     Male    Other
##   Bad                0.082197 0.395099 0.001672
##   Good               0.130114 0.251293 0.000888
##   Neutral            0.043685 0.094006 0.000888
##   Reformed Criminals 0.000052 0.000105 0.000000
prop.table(tab, 2)  # Conditional on columns
##                     
##                       Female    Male   Other
##   Bad                0.32102 0.53355 0.48485
##   Good               0.50816 0.33936 0.25758
##   Neutral            0.17061 0.12695 0.25758
##   Reformed Criminals 0.00020 0.00014 0.00000

Distribution of one variable

Marginal Distribution

table( comics$id )
## 
## No Dual  Public  Secret Unknown 
##    1788    6994    8698       9

Faceting: splits the dataset by the levels of a variable and draws one panel per level

ggplot( comics, aes( x=id ) ) +
  geom_bar() +
  facet_wrap( ~align ) +
  theme(axis.text.x = element_text(angle = 45))

Faceting is like an expansion of a stacked bar chart: each level of the faceting variable gets its own panel.

# Keep only the Bad, Neutral and Good alignments and store align as a factor
# whose levels run Bad -> Neutral -> Good, so bars are drawn in that order
# rather than alphabetically. Note that this overwrites comics.
comics <- comics %>%
  filter(align %in% c("Bad", "Neutral", "Good")) %>%
  mutate(align = factor(align, levels = c("Bad", "Neutral", "Good")))

# Bar chart of character counts per alignment, in the new level order.
comics %>%
  ggplot(aes(x = align)) +
  geom_bar()

# Keep only Female and Male characters and store gender as a factor with the
# levels in that order. Note that this overwrites comics.
comics <- comics %>%
  filter(gender %in% c("Female", "Male")) %>%
  mutate(gender = factor(gender, levels = c("Female", "Male")))

# Bar chart of alignment counts, drawn in one panel per gender.
comics %>%
  ggplot(aes(x = align)) +
  geom_bar() +
  facet_wrap(~gender)

Exploring Numerical Data

carsurl <- 'https://assets.datacamp.com/production/repositories/537/datasets/c0366d5da5ee8dce49919a5443685cf2e50c6a96/cars04.csv'
cars_df <- read.csv( carsurl )
str( cars_df )
## 'data.frame':    428 obs. of  19 variables:
##  $ name       : chr  "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
##  $ sports_car : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ suv        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ wagon      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ minivan    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pickup     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ all_wheel  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ rear_wheel : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ msrp       : int  11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
##  $ dealer_cost: int  10965 11802 13697 13884 15357 12849 14086 12482 12906 14496 ...
##  $ eng_size   : num  1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
##  $ ncyl       : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ horsepwr   : int  103 103 140 140 140 132 132 130 110 130 ...
##  $ city_mpg   : int  28 28 26 26 26 29 29 26 27 26 ...
##  $ hwy_mpg    : int  34 34 37 37 37 36 36 33 36 33 ...
##  $ weight     : int  2370 2348 2617 2676 2617 2581 2626 2612 2606 2606 ...
##  $ wheel_base : int  98 98 104 104 104 105 105 103 103 103 ...
##  $ length     : int  167 153 183 183 183 174 174 168 168 168 ...
##  $ width      : int  66 66 69 68 69 67 67 67 67 67 ...

Dotplots

ggplot( cars_df, aes( x = weight ) ) +
  geom_dotplot( dotsize = 0.3 )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bindot).

Histogram

ggplot( cars_df, aes( x = weight ) ) +
  geom_histogram( )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Density plot

ggplot( cars_df, aes( x=weight ) ) +
  geom_density()
## Warning: Removed 2 rows containing non-finite values (stat_density).

Boxplot

ggplot( cars_df, aes( x = 1, y = weight ) ) +
  geom_boxplot() +
  coord_flip()
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

Faceted Histogram

ggplot( cars_df, aes( x = hwy_mpg ) ) +
  geom_histogram() +
  facet_wrap( ~pickup )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).

# Keep only the cars that have 4, 6 or 8 cylinders.
common_cyl <- cars_df %>%
  filter(ncyl %in% c(4, 6, 8))

# Box plots of city mileage, one box per cylinder count
# (ncyl is converted to a factor so it is treated as categorical).
common_cyl %>%
  ggplot(aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor( ncyl ))) +
  geom_density(alpha = .3)
## Warning: Removed 11 rows containing non-finite values (stat_density).

Distribution of one variable

building a data pipeline

cars_df %>% 
  filter( eng_size < 2.0 ) %>%
  ggplot( aes( hwy_mpg ) ) +
  geom_histogram( binwidth = 5 )

cars_df %>% 
  filter( eng_size < 2.0 ) %>%
  ggplot( aes( hwy_mpg ) ) +
  geom_density( bw = 5 )

# Create hist of horsepwr
cars_df %>%
  ggplot(aes(horsepwr)) +
  geom_histogram() +
  ggtitle('hist of horsepwr')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Create hist of horsepwr for affordable cars
cars_df %>% 
  filter(msrp < 25000) %>%
  ggplot(aes(horsepwr)) +
  geom_histogram() +
  xlim(c(90, 550)) +
  ggtitle('hist of horsepwr for affordable cars')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Create hist of horsepwr with binwidth of 3
cars_df %>%
  ggplot(aes(horsepwr)) +
  geom_histogram(binwidth = 3) +
  ggtitle('horsepwr with binwidth of 3')

# Create hist of horsepwr with binwidth of 30
cars_df %>%
  ggplot(aes(horsepwr)) +
  geom_histogram(binwidth = 30) +
  ggtitle('horsepwr with binwidth of 30')

# Create hist of horsepwr with binwidth of 60
cars_df %>%
  ggplot(aes(horsepwr)) +
  geom_histogram(binwidth = 60) +
  ggtitle('horsepwr with binwidth of 60')

Box Plots

ggplot( common_cyl, aes( as.factor( ncyl ), y = city_mpg ) ) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Box plot of msrp for every car; x = 1 is a dummy position because a
# single box is drawn.
ggplot(cars_df, aes(x = 1, y = msrp)) +
  geom_boxplot()

# Reduced dataset: drop the cars priced at 100000 or more.
cars_no_out <- filter(cars_df, msrp < 100000)

# Same box plot, drawn from the reduced dataset.
ggplot(cars_no_out, aes(x = 1, y = msrp)) +
  geom_boxplot()

Visualization in higher dimensions

Facet Grid

ggplot( cars_df, aes( x = msrp ) ) +
  geom_density() +
  facet_grid( pickup ~ rear_wheel, labeller = label_both )

table( cars_df$rear_wheel, cars_df$pickup )
##        
##         FALSE TRUE
##   FALSE   306   12
##   TRUE     98   12
# Facet hists using hwy mileage and ncyl
common_cyl %>%
  ggplot(aes(x = hwy_mpg)) +
  geom_histogram() +
  facet_grid(ncyl ~ suv, labeller = label_both ) +
  ggtitle('Some title')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).

Measures of Center

life_url <- 'https://assets.datacamp.com/production/repositories/537/datasets/e079a96a639aa10afc478359da45f2f75f7efd2e/life_exp_raw.csv'
life <- read.csv( life_url )
glimpse( life )
## Rows: 81,691
## Columns: 10
## $ State                                    <chr> "Alabama", "Alabama", "Alaba…
## $ County                                   <chr> "Autauga County", "Baldwin C…
## $ fips                                     <int> 1001, 1003, 1005, 1007, 1009…
## $ Year                                     <int> 1985, 1985, 1985, 1985, 1985…
## $ Female.life.expectancy..years.           <dbl> 77, 79, 76, 77, 79, 75, 77, …
## $ Female.life.expectancy..national..years. <dbl> 78, 78, 78, 78, 78, 78, 78, …
## $ Female.life.expectancy..state..years.    <dbl> 77, 77, 77, 77, 77, 77, 77, …
## $ Male.life.expectancy..years.             <dbl> 68, 71, 67, 67, 71, 67, 68, …
## $ Male.life.expectancy..national..years.   <dbl> 71, 71, 71, 71, 71, 71, 71, …
## $ Male.life.expectancy..state..years.      <dbl> 69, 69, 69, 69, 69, 69, 69, …
colnames( life )
##  [1] "State"                                   
##  [2] "County"                                  
##  [3] "fips"                                    
##  [4] "Year"                                    
##  [5] "Female.life.expectancy..years."          
##  [6] "Female.life.expectancy..national..years."
##  [7] "Female.life.expectancy..state..years."   
##  [8] "Male.life.expectancy..years."            
##  [9] "Male.life.expectancy..national..years."  
## [10] "Male.life.expectancy..state..years."

Numerical Summaries

Median: robust to extreme values. Mean: sensitive to extreme values.

# Compare female and male life expectancy between the west-coast states
# (California, Oregon, Washington) and all other states, reporting both the
# mean and the median across the rows of each group.
# Note: the result is assigned back to `life`, so from here on `life` holds
# this summary table rather than the raw data read from the CSV.
life <- life %>%
  group_by(
    west_coast = State %in% c("California", "Oregon", "Washington")
  ) %>%
  summarise(
    mean(Female.life.expectancy..years.),
    mean(Male.life.expectancy..years.),
    median(Female.life.expectancy..years.),
    median(Male.life.expectancy..years.)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
life
## # A tibble: 2 x 5
##   west_coast `mean(Female.li… `mean(Male.life… `median(Female.… `median(Male.li…
##   <lgl>                 <dbl>            <dbl>            <dbl>            <dbl>
## 1 FALSE                  78.7             72.6             78.8             72.8
## 2 TRUE                   79.6             74.4             79.5             74.3
data( gapminder )
glimpse( gapminder )
## Rows: 1,704
## Columns: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghani…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia,…
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,…
## $ lifeExp   <dbl> 29, 30, 32, 34, 36, 38, 40, 41, 42, 42, 42, 44, 55, 59, 65,…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 1…
## $ gdpPercap <dbl> 779, 821, 853, 836, 740, 786, 978, 852, 649, 635, 727, 975,…
# Restrict gapminder to the observations from 2007.
gap2007 <- gapminder %>%
  filter(year == 2007)

# Mean and median life expectancy within each continent.
gap2007 %>%
  group_by(continent) %>%
  summarise(
    mean(lifeExp),
    median(lifeExp)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 3
##   continent `mean(lifeExp)` `median(lifeExp)`
##   <fct>               <dbl>             <dbl>
## 1 Africa               54.8              52.9
## 2 Americas             73.6              72.9
## 3 Asia                 70.7              72.4
## 4 Europe               77.6              78.6
## 5 Oceania              80.7              80.7
# Generate box plots of lifeExp for each continent
gap2007 %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot()

Visualization in higher dimensions

# Density of msrp in a grid of panels. facet_grid() takes rows ~ columns,
# so pickup defines the rows and rear_wheel defines the columns.
cars_df %>%
  ggplot(aes(x = msrp)) +
  geom_density() +
  facet_grid(pickup ~ rear_wheel)

Measures of Variability

Variance: sum the squared distances of each measurement from the group mean, then divide that sum by the number of measurements minus 1. Standard deviation: the square root of the variance.

Standard deviation is commonly used because it is expressed in the same units as the data.

#Standard deviation
sd( gapminder$lifeExp )
## [1] 13
#Variance
var( gapminder$lifeExp )
## [1] 167
#summary stats
summary( gapminder$lifeExp )
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      24      48      61      59      71      83
#inter quartile range
IQR( gapminder$lifeExp )
## [1] 23

IQR is a more robust measure of data spread than range; less impacted by outliers.

# Compute groupwise measures of spread
gap2007 %>%
  group_by(continent) %>%
  summarize(sd(lifeExp),
            IQR(lifeExp),
            n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 4
##   continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
##   <fct>             <dbl>          <dbl> <int>
## 1 Africa            9.63          11.6      52
## 2 Americas          4.44           4.63     25
## 3 Asia              7.96          10.2      33
## 4 Europe            2.98           4.78     30
## 5 Oceania           0.729          0.516     2
# Generate overlaid density plots
gap2007 %>%
  ggplot(aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.3)

#based on the graphs, select the most appropriate measures of the data center & spread.

gap2007 %>%
  filter(continent == 'Americas') %>%
  ggplot( aes( lifeExp ) ) +
  geom_density()

# Compute stats for lifeExp in Americas
gap2007 %>%
  filter(continent == 'Americas') %>%
  summarize(mean( lifeExp ),
            sd( lifeExp ))
## # A tibble: 1 x 2
##   `mean(lifeExp)` `sd(lifeExp)`
##             <dbl>         <dbl>
## 1            73.6          4.44
ggplot( gap2007, aes( pop ) ) +
  geom_density()

# Compute stats for population
gap2007 %>%
  summarize(median( pop ),
            IQR( pop ))
## # A tibble: 1 x 2
##   `median(pop)` `IQR(pop)`
##           <dbl>      <dbl>
## 1      10517531  26702008.
# Density of the untransformed population variable.
ggplot(gap2007, aes(x = pop)) +
  geom_density()

# Add the natural log of pop as a new column to reduce the skew.
gap2007 <- mutate(gap2007, log_pop = log(pop))

# Density of the log-transformed population variable.
ggplot(gap2007, aes(x = log_pop)) +
  geom_density()

Outliers

# Asian countries in 2007, with a flag marking life expectancy below 50
# as an outlier.
gap_asia <- filter(gap2007, continent == "Asia")
gap_asia <- mutate(gap_asia, is_outlier = lifeExp < 50)

# Box plot of life expectancy after dropping the flagged rows; x = 1 is a
# dummy position because a single box is drawn.
ggplot(filter(gap_asia, !is_outlier), aes(x = 1, y = lifeExp)) +
  geom_boxplot()

Case Study

Email Data: What characteristics of an email are associated with it being spam?

glimpse( email )
## Rows: 3,921
## Columns: 21
## $ spam         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ from         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ cc           <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1, 0, 2, 0,…
## $ sent_email   <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,…
## $ time         <dttm> 2012-01-01 01:16:41, 2012-01-01 02:03:59, 2012-01-01 11…
## $ image        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ attach       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ dollar       <dbl> 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 5, 0, 0,…
## $ winner       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
## $ inherit      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ password     <dbl> 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ num_char     <dbl> 11.37, 10.50, 7.77, 13.26, 1.23, 1.09, 4.84, 7.42, 3.05,…
## $ line_breaks  <int> 202, 202, 192, 255, 29, 25, 193, 237, 69, 68, 25, 79, 19…
## $ format       <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,…
## $ re_subj      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,…
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ exclaim_mess <dbl> 0, 1, 6, 48, 1, 1, 1, 18, 1, 0, 2, 1, 0, 10, 4, 10, 20, …
## $ number       <fct> big, small, small, small, none, none, big, small, small,…
# Compute summary statistics
email %>%
  group_by( spam ) %>%
  summarize( median( num_char ), IQR( num_char ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##    spam `median(num_char)` `IQR(num_char)`
##   <dbl>              <dbl>           <dbl>
## 1     0               6.83           13.6 
## 2     1               1.05            2.82
# Box plots of log-transformed num_char, one box per spam class.
# spam is mapped to x (as a factor) rather than only to group: with group
# alone the boxes have no labelled position on the x-axis, so the two
# classes cannot be told apart when reading the plot.
email %>%
  mutate(log_num_char = log(num_char)) %>%
  ggplot(aes(x = as.factor(spam), y = log_num_char)) +
  geom_boxplot() +
  xlab("spam")

The median length of not-spam emails is greater than that of spam emails

# Compute center and spread for exclaim_mess by spam
email %>%
  group_by( spam ) %>%
  summarize( median( exclaim_mess ), IQR( exclaim_mess ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##    spam `median(exclaim_mess)` `IQR(exclaim_mess)`
##   <dbl>                  <dbl>               <dbl>
## 1     0                      1                   5
## 2     1                      0                   1
# Overlaid densities of log-transformed exclaim_mess, one per spam class.
# 0.01 is added before taking the log because exclaim_mess contains zeros
# and log(0) is -Inf.
# spam is mapped to fill (as a factor) so the two curves are distinguishable
# and get a legend; with only a group mapping both curves look identical and
# alpha has no visible effect.
email %>%
  mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
  ggplot(aes(x = log_exclaim_mess, fill = as.factor(spam))) +
  geom_density(alpha = 0.3) +
  xlim(-10, 10) +
  labs(fill = "spam")

The most common value of exclaim_mess in both classes of email is zero (a log(exclaim_mess) of -4.6 after adding .01).
Even after a transformation, the distribution of exclaim_mess in both classes of email is right-skewed.
The typical number of exclamations in the not-spam group appears to be slightly higher than in the spam group.

email %>%
  mutate( zero = exclaim_mess == 0 ) %>%
  ggplot( aes( x = zero, color = as.factor( spam ), fill = as.factor( spam ) ) ) +
  geom_bar() +
  facet_wrap( ~spam )

email %>% mutate( zero = exclaim_mess == 0 ) %>%
  ggplot( aes( x = zero, fill = as.factor( spam ) ) ) +
  geom_bar( position = 'fill' )

table( email$image )
## 
##    0    1    2    3    4    5    9   20 
## 3811   76   17   11    2    2    1    1
email %>%
  mutate(has_image = image > 0 ) %>%
  ggplot( aes( x = has_image, color = as.factor( spam ), fill = as.factor( spam ) ) ) +
  geom_bar() +
  facet_wrap( ~spam )

email %>%
  mutate(has_image = image > 0 ) %>%
  ggplot(aes(x = has_image, color = as.factor( spam ), fill = as.factor( spam ) )) +
  geom_bar(position = 'fill')

# Question 1: For emails containing the word "dollar", does the typical spam email contain a greater number of occurrences of the word than the typical non-spam email? Create a summary statistic that answers this question.
email %>%
  filter( dollar > 0 ) %>%
  group_by(spam) %>%
  summarize(median( dollar ))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##    spam `median(dollar)`
##   <dbl>            <dbl>
## 1     0                4
## 2     1                2
# Question 2: If you encounter an email with greater than 10 occurrences of
# the word "dollar", is it more likely to be spam or not-spam? Create a
# barchart that answers this question.
# spam is converted to a factor so that the x-axis is discrete, with one
# labelled bar per class, instead of a continuous numeric scale.
email %>%
  filter(dollar > 10) %>%
  ggplot(aes(x = as.factor(spam))) +
  geom_bar() +
  xlab("spam")

Custom Ordering bars

# Flag messages that contain no exclamation marks. The flag is stored as a
# factor with TRUE listed before FALSE, because the order of the factor
# levels is what sets the order of the bars.
email <- email %>%
  mutate(zero = factor(exclaim_mess == 0, levels = c("TRUE", "FALSE")))

# Counts of zero / non-zero exclamation messages, one panel per spam class.
ggplot(email, aes(x = zero)) +
  geom_bar() +
  facet_wrap(~spam)

# Copy number into a new factor column whose levels run none -> small -> big.
email <- email %>%
  mutate(
    number_reordered = factor(number, levels = c("none", "small", "big"))
  )

# Bar chart of number_reordered, one panel per spam class.
email %>%
  ggplot(aes(x = number_reordered)) +
  geom_bar() +
  facet_wrap(~spam)