Exploring Categorical Data

> library(tidyverse)
> library(openintro)
> comics <- read.csv("C:\\Users\\pbj20\\Documents\\R Documents\\comics.csv")
> comics <- as_tibble(comics)
> head(comics)

# A tibble: 6 x 11
  name  id    align eye   hair  gender gsm   alive appearances first_appear
  <chr> <chr> <chr> <chr> <chr> <chr>  <chr> <chr>       <int> <chr>       
1 "Spi~ Secr~ Good  Haze~ Brow~ Male   <NA>  Livi~        4043 Aug-62      
2 "Cap~ Publ~ Good  Blue~ Whit~ Male   <NA>  Livi~        3360 Mar-41      
3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male   <NA>  Livi~        3061 Oct-74      
4 "Iro~ Publ~ Good  Blue~ Blac~ Male   <NA>  Livi~        2961 Mar-63      
5 "Tho~ No D~ Good  Blue~ Blon~ Male   <NA>  Livi~        2258 Nov-50      
6 "Ben~ Publ~ Good  Blue~ No H~ Male   <NA>  Livi~        2255 Nov-61      
# ... with 1 more variable: publisher <chr>

> glimpse(comics)

Rows: 23,272
Columns: 11
$ name         <chr> "Spider-Man (Peter Parker)", "Captain America (Steven ...
$ id           <chr> "Secret", "Public", "Public", "Public", "No Dual", "Pu...
$ align        <chr> "Good", "Good", "Neutral", "Good", "Good", "Good", "Go...
$ eye          <chr> "Hazel Eyes", "Blue Eyes", "Blue Eyes", "Blue Eyes", "...
$ hair         <chr> "Brown Hair", "White Hair", "Black Hair", "Black Hair"...
$ gender       <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male"...
$ gsm          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ alive        <chr> "Living Characters", "Living Characters", "Living Char...
$ appearances  <int> 4043, 3360, 3061, 2961, 2258, 2255, 2072, 2017, 1955, ...
$ first_appear <chr> "Aug-62", "Mar-41", "Oct-74", "Mar-63", "Nov-50", "Nov...
$ publisher    <chr> "marvel", "marvel", "marvel", "marvel", "marvel", "mar...

> table(comics$id,comics$align)

         
           Bad Good Neutral Reformed Criminals
  No Dual  474  647     390                  0
  Public  2172 2930     965                  1
  Secret  4493 2475     959                  1
  Unknown    7    0       2                  0

> comics$align <- as.factor(comics$align)
> levels(comics$align)

[1] "Bad"                "Good"               "Neutral"           
[4] "Reformed Criminals"

> comics$name <- as.factor(comics$name)
> comics$id <- as.factor(comics$id)
> comics$eye <- as.factor(comics$eye)
> comics$hair <- as.factor(comics$hair)
> comics$gender <- as.factor(comics$gender)
> comics$alive <- as.factor(comics$alive)
> comics$first_appear <- as.factor(comics$first_appear)
> comics$publisher <- as.factor(comics$publisher)
> 
> levels(comics$id)

[1] "No Dual" "Public"  "Secret"  "Unknown"

> ggplot(comics,aes(x=id,fill=align))+geom_bar()

> # Print the first rows of the data
> head(comics)

# A tibble: 6 x 11
  name  id    align eye   hair  gender gsm   alive appearances first_appear
  <fct> <fct> <fct> <fct> <fct> <fct>  <chr> <fct>       <int> <fct>       
1 "Spi~ Secr~ Good  Haze~ Brow~ Male   <NA>  Livi~        4043 Aug-62      
2 "Cap~ Publ~ Good  Blue~ Whit~ Male   <NA>  Livi~        3360 Mar-41      
3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male   <NA>  Livi~        3061 Oct-74      
4 "Iro~ Publ~ Good  Blue~ Blac~ Male   <NA>  Livi~        2961 Mar-63      
5 "Tho~ No D~ Good  Blue~ Blon~ Male   <NA>  Livi~        2258 Nov-50      
6 "Ben~ Publ~ Good  Blue~ No H~ Male   <NA>  Livi~        2255 Nov-61      
# ... with 1 more variable: publisher <fct>

> # Check levels of align
> levels(comics$align)

[1] "Bad"                "Good"               "Neutral"           
[4] "Reformed Criminals"

> # Check the levels of gender
> levels(comics$gender)

[1] "Female" "Male"   "Other"

> # Create a 2-way contingency table
> tab <- table(comics$align,comics$gender)
> 
> # Print tab
> print(tab)

                    
                     Female Male Other
  Bad                  1573 7561    32
  Good                 2490 4809    17
  Neutral               836 1799    17
  Reformed Criminals      1    2     0

> # Remove the rare "Reformed Criminals" level of align
> # (note: filter() also drops the rows where align is NA)
> comics_filtered <- comics %>%
+   filter(align != "Reformed Criminals") %>%
+   droplevels()

> # See the result
> comics_filtered

# A tibble: 19,856 x 11
   name  id    align eye   hair  gender gsm   alive appearances first_appear
   <fct> <fct> <fct> <fct> <fct> <fct>  <chr> <fct>       <int> <fct>       
 1 "Spi~ Secr~ Good  Haze~ Brow~ Male   <NA>  Livi~        4043 Aug-62      
 2 "Cap~ Publ~ Good  Blue~ Whit~ Male   <NA>  Livi~        3360 Mar-41      
 3 "Wol~ Publ~ Neut~ Blue~ Blac~ Male   <NA>  Livi~        3061 Oct-74      
 4 "Iro~ Publ~ Good  Blue~ Blac~ Male   <NA>  Livi~        2961 Mar-63      
 5 "Tho~ No D~ Good  Blue~ Blon~ Male   <NA>  Livi~        2258 Nov-50      
 6 "Ben~ Publ~ Good  Blue~ No H~ Male   <NA>  Livi~        2255 Nov-61      
 7 "Ree~ Publ~ Good  Brow~ Brow~ Male   <NA>  Livi~        2072 Nov-61      
 8 "Hul~ Publ~ Good  Brow~ Brow~ Male   <NA>  Livi~        2017 May-62      
 9 "Sco~ Publ~ Neut~ Brow~ Brow~ Male   <NA>  Livi~        1955 Sep-63      
10 "Jon~ Publ~ Good  Blue~ Blon~ Male   <NA>  Livi~        1934 Nov-61      
# ... with 19,846 more rows, and 1 more variable: publisher <fct>

> # Create side-by-side barchart of gender by alignment
> ggplot(comics_filtered, aes(x = align, fill = gender)) + 
+   geom_bar(position = "dodge")

> # Create side-by-side barchart of alignment by gender
> ggplot(comics_filtered, aes(x = gender, fill = align)) + 
+   geom_bar(position = "dodge") +
+   theme(axis.text.x = element_text(angle = 90))

> options(scipen = 999, digits =3) #simplify display format
> tab_cnt <- table(comics_filtered$id, comics_filtered$align)
> tab_cnt

         
           Bad Good Neutral
  No Dual  474  647     390
  Public  2172 2930     965
  Secret  4493 2475     959
  Unknown    7    0       2

> prop.table(tab_cnt)

         
               Bad     Good  Neutral
  No Dual 0.030553 0.041704 0.025139
  Public  0.140003 0.188862 0.062202
  Secret  0.289609 0.159533 0.061815
  Unknown 0.000451 0.000000 0.000129

> sum(prop.table(tab_cnt))

[1] 1

> # each row = 100%
> prop.table(tab_cnt,1)

         
            Bad  Good Neutral
  No Dual 0.314 0.428   0.258
  Public  0.358 0.483   0.159
  Secret  0.567 0.312   0.121
  Unknown 0.778 0.000   0.222

> # each column = 100%
> prop.table(tab_cnt,2)

         
               Bad     Good  Neutral
  No Dual 0.066331 0.106907 0.168394
  Public  0.303946 0.484137 0.416667
  Secret  0.628743 0.408956 0.414076
  Unknown 0.000980 0.000000 0.000864

> ggplot(comics_filtered, aes(x = id, fill=align))+
+   geom_bar(position = "fill") + 
+   ylab("proportion")

> ggplot(comics_filtered, aes(x = align, fill=id))+
+   geom_bar(position = "fill") + 
+   ylab("proportion")

> tab <- table(comics_filtered$align, comics_filtered$gender)
> options(scipen = 999, digits = 3) # Print fewer digits
> prop.table(tab)     # Joint proportions

         
            Female     Male    Other
  Bad     0.082210 0.395160 0.001672
  Good    0.130135 0.251333 0.000888
  Neutral 0.043692 0.094021 0.000888

> prop.table(tab, 2)  # Conditional on columns

         
          Female  Male Other
  Bad      0.321 0.534 0.485
  Good     0.508 0.339 0.258
  Neutral  0.171 0.127 0.258

> # Plot of gender by align
> ggplot(comics_filtered, aes(x = align, fill = gender)) +
+   geom_bar()

> # Plot proportion of gender, conditional on align
> ggplot(comics_filtered, aes(x = align, fill = gender)) + 
+   geom_bar(position = "fill") +
+   ylab("proportion")

> table(comics$id)


No Dual  Public  Secret Unknown 
   1788    6994    8698       9

> ggplot(comics_filtered, aes(x=id)) + geom_bar() + 
+   facet_wrap(~align) +
+   theme(axis.text.x = element_text(angle = 90))

> # Change the order of the levels in align
> comics_filtered$align <- factor(comics_filtered$align, 
+                        levels = c("Bad", "Neutral", "Good"))
> 
> # Create plot of align
> ggplot(comics_filtered, aes(x = align)) + 
+   geom_bar()

> # Plot of alignment broken down by gender
> ggplot(comics_filtered, aes(x = align)) + 
+   geom_bar() +
+   facet_wrap(~ gender)

Exploring Numerical Data

> cars <- read_csv("C:\\Users\\pbj20\\Documents\\R Documents\\cars.csv")
> 
> glimpse(cars)

Rows: 428
Columns: 19
$ name        <chr> "Chevrolet Aveo 4dr", "Chevrolet Aveo LS 4dr hatch", "C...
$ sports_car  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ suv         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ wagon       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ minivan     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ pickup      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ all_wheel   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ rear_wheel  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ msrp        <dbl> 11690, 12585, 14610, 14810, 16385, 13670, 15040, 13270,...
$ dealer_cost <dbl> 10965, 11802, 13697, 13884, 15357, 12849, 14086, 12482,...
$ eng_size    <dbl> 1.6, 1.6, 2.2, 2.2, 2.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...
$ ncyl        <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
$ horsepwr    <dbl> 103, 103, 140, 140, 140, 132, 132, 130, 110, 130, 130, ...
$ city_mpg    <dbl> 28, 28, 26, 26, 26, 29, 29, 26, 27, 26, 26, 32, 36, 32,...
$ hwy_mpg     <dbl> 34, 34, 37, 37, 37, 36, 36, 33, 36, 33, 33, 38, 44, 38,...
$ weight      <dbl> 2370, 2348, 2617, 2676, 2617, 2581, 2626, 2612, 2606, 2...
$ wheel_base  <dbl> 98, 98, 104, 104, 104, 105, 105, 103, 103, 103, 103, 10...
$ length      <dbl> 167, 153, 183, 183, 183, 174, 174, 168, 168, 168, 168, ...
$ width       <dbl> 66, 66, 69, 68, 69, 67, 67, 67, 67, 67, 67, 67, 67, 68,...

> # Learn data structure
> str(cars)

tibble [428 x 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ name       : chr [1:428] "Chevrolet Aveo 4dr" "Chevrolet Aveo LS 4dr hatch" "Chevrolet Cavalier 2dr" "Chevrolet Cavalier 4dr" ...
 $ sports_car : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ suv        : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ wagon      : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ minivan    : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ pickup     : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ all_wheel  : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ rear_wheel : logi [1:428] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ msrp       : num [1:428] 11690 12585 14610 14810 16385 ...
 $ dealer_cost: num [1:428] 10965 11802 13697 13884 15357 ...
 $ eng_size   : num [1:428] 1.6 1.6 2.2 2.2 2.2 2 2 2 2 2 ...
 $ ncyl       : num [1:428] 4 4 4 4 4 4 4 4 4 4 ...
 $ horsepwr   : num [1:428] 103 103 140 140 140 132 132 130 110 130 ...
 $ city_mpg   : num [1:428] 28 28 26 26 26 29 29 26 27 26 ...
 $ hwy_mpg    : num [1:428] 34 34 37 37 37 36 36 33 36 33 ...
 $ weight     : num [1:428] 2370 2348 2617 2676 2617 ...
 $ wheel_base : num [1:428] 98 98 104 104 104 105 105 103 103 103 ...
 $ length     : num [1:428] 167 153 183 183 183 174 174 168 168 168 ...
 $ width      : num [1:428] 66 66 69 68 69 67 67 67 67 67 ...
 - attr(*, "spec")=
  .. cols(
  ..   name = col_character(),
  ..   sports_car = col_logical(),
  ..   suv = col_logical(),
  ..   wagon = col_logical(),
  ..   minivan = col_logical(),
  ..   pickup = col_logical(),
  ..   all_wheel = col_logical(),
  ..   rear_wheel = col_logical(),
  ..   msrp = col_double(),
  ..   dealer_cost = col_double(),
  ..   eng_size = col_double(),
  ..   ncyl = col_double(),
  ..   horsepwr = col_double(),
  ..   city_mpg = col_double(),
  ..   hwy_mpg = col_double(),
  ..   weight = col_double(),
  ..   wheel_base = col_double(),
  ..   length = col_double(),
  ..   width = col_double()
  .. )

> # Create faceted histogram
> ggplot(cars, aes(x = city_mpg)) +
+   geom_histogram() +
+   facet_wrap(~ suv)

> unique(cars$ncyl)

[1]  4  6  3  8  5 12 10 -1

> # Filter cars with 4, 6, 8 cylinders
> common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
> 
> # Create box plots of city mpg by ncyl
> ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
+   geom_boxplot()

> # Create overlaid density plots for same data
> ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
+   geom_density(alpha = .3)

> cars %>% filter(eng_size < 2.0) %>% 
+   ggplot(aes(x=hwy_mpg)) + 
+   geom_histogram()

> cars %>% filter(eng_size < 2.0) %>% 
+   ggplot(aes(x=hwy_mpg)) + 
+   geom_histogram(binwidth = 5)

> cars %>% filter(eng_size < 2.0) %>% 
+   ggplot(aes(x=hwy_mpg)) + 
+   geom_density(bw = 5)

> # Create hist of horsepwr
> cars %>%
+   ggplot(aes(x=horsepwr)) +
+   geom_histogram() +
+   ggtitle("Hist of Horsepwr")

> # Create hist of horsepwr for affordable cars
> cars %>% 
+   filter(msrp<25000) %>%
+   ggplot(aes(x=horsepwr)) +
+   geom_histogram() +
+   xlim(c(90, 550)) +
+   ggtitle("Hist of Horsepwr")

> # Create hist of horsepwr with binwidth of 3
> cars %>%
+   ggplot(aes(x=horsepwr)) +
+   geom_histogram(binwidth = 3) +
+   ggtitle("Hist bin 3")

> # Create hist of horsepwr with binwidth of 30
> cars %>%
+   ggplot(aes(x=horsepwr)) +
+   geom_histogram(binwidth = 30) +
+   ggtitle("Hist bin 30")

> # Create hist of horsepwr with binwidth of 60
> cars %>%
+   ggplot(aes(x=horsepwr)) +
+   geom_histogram(binwidth = 60) +
+   ggtitle("Hist bin 60")

> # Use x = 1 to draw a box plot of a single variable
> 
> # Construct box plot of msrp
> cars %>%
+   ggplot(aes(x = 1, y = msrp)) +
+   geom_boxplot()

> # Exclude outliers from data
> cars_no_out <- cars %>%
+   filter(msrp<100000)
> 
> # Construct box plot of msrp using the reduced dataset
> cars_no_out %>%
+   ggplot(aes(x = 1, y = msrp)) +
+   geom_boxplot()

> # Create plot of city_mpg
> cars %>%
+   ggplot(aes(x=1, y=city_mpg)) +
+   geom_boxplot()

> cars %>%
+   ggplot(aes(x=city_mpg)) +
+   geom_density()

> # Create plot of width
> cars %>%
+   ggplot(aes(x=1, y=width)) +
+   geom_boxplot()

> cars %>%
+   ggplot(aes(x=width)) +
+   geom_density()

> ggplot(cars, aes(x=msrp))+
+   geom_density()+
+   facet_grid(pickup ~ rear_wheel)

> # Facet layout: pickup in rows, rear_wheel in columns
> 
> ggplot(cars, aes(x=msrp))+
+   geom_density()+
+   facet_grid(pickup ~ rear_wheel, labeller = label_both)

> table(wheel = cars$rear_wheel, pickup = cars$pickup)

       pickup
wheel   FALSE TRUE
  FALSE   306   12
  TRUE     98   12

> # Histograms of hwy_mpg, faceted by ncyl (rows) and suv (columns)
> common_cyl %>%
+   ggplot(aes(x = hwy_mpg)) +
+   geom_histogram() +
+   facet_grid(ncyl ~ suv) +
+   ggtitle("By ncyl and suv")

Numerical Summaries

> library(gapminder)
> head(gapminder)

# A tibble: 6 x 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.

> # Create dataset of 2007 data
> gap2007 <- filter(gapminder, year == 2007)
> 
> # Compute groupwise mean and median lifeExp
> gap2007 %>%
+   group_by(continent) %>%
+   summarize(mean(lifeExp),
+             median(lifeExp))

# A tibble: 5 x 3
  continent `mean(lifeExp)` `median(lifeExp)`
  <fct>               <dbl>             <dbl>
1 Africa               54.8              52.9
2 Americas             73.6              72.9
3 Asia                 70.7              72.4
4 Europe               77.6              78.6
5 Oceania              80.7              80.7

> # Generate box plots of lifeExp for each continent
> gap2007 %>%
+   ggplot(aes(x = continent, y = lifeExp)) +
+   geom_boxplot()

> # Compute groupwise measures of spread
> gap2007 %>%
+   group_by(continent) %>%
+   summarize(sd(lifeExp),
+             IQR(lifeExp),
+             n())

# A tibble: 5 x 4
  continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
  <fct>             <dbl>          <dbl> <int>
1 Africa            9.63          11.6      52
2 Americas          4.44           4.63     25
3 Asia              7.96          10.2      33
4 Europe            2.98           4.78     30
5 Oceania           0.729          0.516     2

> # Generate overlaid density plots
> gap2007 %>%
+   ggplot(aes(x = lifeExp, fill = continent)) +
+   geom_density(alpha = 0.3)

> # Compute stats for lifeExp in Americas
> gap2007 %>%
+   filter(continent=="Americas") %>%
+   summarize(mean(lifeExp),
+             sd(lifeExp))

# A tibble: 1 x 2
  `mean(lifeExp)` `sd(lifeExp)`
            <dbl>         <dbl>
1            73.6          4.44

> # Compute stats for population
> gap2007 %>%
+   summarize(median(pop),
+             IQR(pop))

# A tibble: 1 x 2
  `median(pop)` `IQR(pop)`
          <dbl>      <dbl>
1      10517531  26702008.

> # Create density plot of the original (untransformed) pop variable
> gap2007 %>%
+   ggplot(aes(x = pop)) +
+   geom_density()

> # Transform the skewed pop variable
> gap2007 <- gap2007 %>%
+   mutate(log_pop = log(pop))
> 
> # Create density plot of new variable
> gap2007 %>%
+   ggplot(aes(x = log_pop)) +
+   geom_density()

> # Filter for Asia, add column indicating outliers
> gap_asia <- gap2007 %>%
+   filter(continent == "Asia") %>%
+   mutate(is_outlier = lifeExp <50)
> 
> # Remove outliers, create box plot of lifeExp
> gap_asia %>%
+   filter(!is_outlier) %>%
+   ggplot(aes(x = 1, y = lifeExp)) +
+   geom_boxplot()

Case Study

> head(email)

# A tibble: 6 x 21
   spam to_multiple  from    cc sent_email time                image attach
  <dbl>       <dbl> <dbl> <int>      <dbl> <dttm>              <dbl>  <dbl>
1     0           0     1     0          0 2012-01-01 01:16:41     0      0
2     0           0     1     0          0 2012-01-01 02:03:59     0      0
3     0           0     1     0          0 2012-01-01 11:00:32     0      0
4     0           0     1     0          0 2012-01-01 04:09:49     0      0
5     0           0     1     0          0 2012-01-01 05:00:01     0      0
6     0           0     1     0          0 2012-01-01 05:04:46     0      0
# ... with 13 more variables: dollar <dbl>, winner <fct>, inherit <dbl>,
#   viagra <dbl>, password <dbl>, num_char <dbl>, line_breaks <int>,
#   format <dbl>, re_subj <dbl>, exclaim_subj <dbl>, urgent_subj <dbl>,
#   exclaim_mess <dbl>, number <fct>

> glimpse(email)

Rows: 3,921
Columns: 21
$ spam         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ from         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ cc           <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1, 0, 2, ...
$ sent_email   <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...
$ time         <dttm> 2012-01-01 01:16:41, 2012-01-01 02:03:59, 2012-01-01 ...
$ image        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ attach       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ dollar       <dbl> 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 5, 0, ...
$ winner       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no...
$ inherit      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ password     <dbl> 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
$ num_char     <dbl> 11.370, 10.504, 7.773, 13.256, 1.231, 1.091, 4.837, 7....
$ line_breaks  <int> 202, 202, 192, 255, 29, 25, 193, 237, 69, 68, 25, 79, ...
$ format       <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
$ re_subj      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, ...
$ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
$ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ exclaim_mess <dbl> 0, 1, 6, 48, 1, 1, 1, 18, 1, 0, 2, 1, 0, 10, 4, 10, 20...
$ number       <fct> big, small, small, small, none, none, big, small, smal...

> # Compute summary statistics
> email %>%
+   group_by(spam) %>%
+   summarize(median(num_char),
+             IQR(num_char))

# A tibble: 2 x 3
   spam `median(num_char)` `IQR(num_char)`
  <dbl>              <dbl>           <dbl>
1     0               6.83           13.6 
2     1               1.05            2.82

> # Create side-by-side box plots of log(num_char) by spam status
> # spam is stored as numeric 0/1, so it is converted to a factor here; with a
> # continuous x, geom_boxplot() draws a single box instead of one per class
> email %>%
+   mutate(log_num_char = log(num_char)) %>%
+   ggplot(aes(x = as.factor(spam), y = log_num_char)) +
+   geom_boxplot()

> # Compute center and spread for exclaim_mess by spam
> email %>%
+   group_by(spam) %>%
+   summarize(median(exclaim_mess),
+             IQR(exclaim_mess))

# A tibble: 2 x 3
   spam `median(exclaim_mess)` `IQR(exclaim_mess)`
  <dbl>                  <dbl>               <dbl>
1     0                      1                   5
2     1                      0                   1

> # Create plot for spam and exclaim_mess
> email %>%
+   mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
+   ggplot(aes(x = log_exclaim_mess)) +
+   geom_histogram() +
+   facet_wrap(~ spam)

> # Alternative plot: side-by-side box plots
> email %>%
+   mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
+   ggplot(aes(x = 1, y = log_exclaim_mess)) +
+   geom_boxplot() +
+   facet_wrap(~ spam)

> # Alternative plot: Overlaid density plots
> # spam is numeric 0/1; fill is mapped to a factor so that each class gets its
> # own density curve (a continuous fill does not create groups)
> email %>%
+   mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
+   ggplot(aes(x = log_exclaim_mess, fill = as.factor(spam))) +
+   geom_density(alpha = 0.3)

> table(email$image)


   0    1    2    3    4    5    9   20 
3811   76   17   11    2    2    1    1

> # Create plot of proportion of spam by image
> # spam is numeric 0/1; fill is mapped to a factor so the bars split by class
> email %>%
+   mutate(has_image = image > 0) %>%
+   ggplot(aes(x = has_image, fill = as.factor(spam))) +
+   geom_bar(position = "fill") +
+   ylab("proportion")

> sum(email$num_char < 0 )

[1] 0

> # Test if images count as attachments
> sum(email$image > email$attach)

[1] 0

> # Question 1: among emails with dollar > 0, how many are spam vs. not spam?
> email %>%
+   filter(dollar > 0) %>%
+   group_by(spam) %>%
+   summarize(n())

# A tibble: 2 x 2
   spam `n()`
  <dbl> <int>
1     0   668
2     1    78

> # Question 2: among emails with dollar > 10, bar chart of counts by spam
> email %>%
+   filter(dollar >10) %>%
+   ggplot(aes(x = spam)) +
+   geom_bar()

> # Reorder levels
> email$number_reordered <- factor(email$number, levels = c("none", "small", "big"))
> 
> # Construct plot of number_reordered
> ggplot(email, aes(x = number_reordered)) +
+   geom_bar() +
+   facet_wrap(~ spam)

Basic Exploratory Data Analysis in R

Produced in R Markdown

Paul Jozefek

2019

Exploring Categorical Data

Exploring Numerical Data

Numerical Summaries

Case Study