Ch1: Data Preparation

1.1 Importing data

1.2 Cleaning data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the starwars data set (made available by the dplyr package
# attached above) into the workspace.
data(starwars)

# Subset columns: retain only name, height, and gender.
newdata <- select(starwars, c(name, height, gender))
newdata
## # A tibble: 87 x 3
##    name               height gender   
##    <chr>               <int> <chr>    
##  1 Luke Skywalker        172 masculine
##  2 C-3PO                 167 masculine
##  3 R2-D2                  96 masculine
##  4 Darth Vader           202 masculine
##  5 Leia Organa           150 feminine 
##  6 Owen Lars             178 masculine
##  7 Beru Whitesun lars    165 feminine 
##  8 R5-D4                  97 masculine
##  9 Biggs Darklighter     183 masculine
## 10 Obi-Wan Kenobi        182 masculine
## # ... with 77 more rows
# Retain name plus every column in the contiguous range from mass
# through species (both endpoints included).
newdata <- select(starwars, c(name, mass:species))
newdata
## # A tibble: 87 x 10
##    name   mass hair_color skin_color eye_color birth_year sex   gender homeworld
##    <chr> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr>  <chr>    
##  1 Luke~    77 blond      fair       blue            19   male  mascu~ Tatooine 
##  2 C-3PO    75 <NA>       gold       yellow         112   none  mascu~ Tatooine 
##  3 R2-D2    32 <NA>       white, bl~ red             33   none  mascu~ Naboo    
##  4 Dart~   136 none       white      yellow          41.9 male  mascu~ Tatooine 
##  5 Leia~    49 brown      light      brown           19   fema~ femin~ Alderaan 
##  6 Owen~   120 brown, gr~ light      blue            52   male  mascu~ Tatooine 
##  7 Beru~    75 brown      light      blue            47   fema~ femin~ Tatooine 
##  8 R5-D4    32 <NA>       white, red red             NA   none  mascu~ Tatooine 
##  9 Bigg~    84 black      light      brown           24   male  mascu~ Tatooine 
## 10 Obi-~    77 auburn, w~ fair       blue-gray       57   male  mascu~ Stewjon  
## # ... with 77 more rows, and 1 more variable: species <chr>
# Keep rows whose sex is recorded and is anything other than "male".
# Rows with a missing sex are dropped: the comparison yields NA, and
# filter() keeps only rows where the condition is TRUE.
newdata <- filter(starwars, !(sex == "male"))
newdata
## # A tibble: 23 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 C-3PO    167    75 <NA>       gold       yellow           112 none  mascu~
##  2 R2-D2     96    32 <NA>       white, bl~ red               33 none  mascu~
##  3 Leia~    150    49 brown      light      brown             19 fema~ femin~
##  4 Beru~    165    75 brown      light      blue              47 fema~ femin~
##  5 R5-D4     97    32 <NA>       white, red red               NA none  mascu~
##  6 Jabb~    175  1358 <NA>       green-tan~ orange           600 herm~ mascu~
##  7 IG-88    200   140 none       metal      red               15 none  mascu~
##  8 Mon ~    150    NA auburn     fair       blue              48 fema~ femin~
##  9 Shmi~    163    NA black      fair       brown             72 fema~ femin~
## 10 Ayla~    178    55 none       blue       hazel             48 fema~ femin~
## # ... with 13 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
# Drop characters whose homeworld is Alderaan, Coruscant, or Endor.
# %in% never returns NA, so rows with a missing homeworld are retained.
newdata <- filter(
  starwars,
  !(homeworld %in% c("Alderaan", "Coruscant", "Endor"))
)
newdata
## # A tibble: 80 x 14
##    name  height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke~    172    77 blond      fair       blue            19   male  mascu~
##  2 C-3PO    167    75 <NA>       gold       yellow         112   none  mascu~
##  3 R2-D2     96    32 <NA>       white, bl~ red             33   none  mascu~
##  4 Dart~    202   136 none       white      yellow          41.9 male  mascu~
##  5 Owen~    178   120 brown, gr~ light      blue            52   male  mascu~
##  6 Beru~    165    75 brown      light      blue            47   fema~ femin~
##  7 R5-D4     97    32 <NA>       white, red red             NA   none  mascu~
##  8 Bigg~    183    84 black      light      brown           24   male  mascu~
##  9 Obi-~    182    77 auburn, w~ fair       blue-gray       57   male  mascu~
## 10 Anak~    188    84 blond      fair       blue            41.9 male  mascu~
## # ... with 70 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
# Add two derived columns, leaving the existing ones untouched:
#   ht_inch - height multiplied by 0.394
#   mass_pd - mass multiplied by 2.205
# NOTE(review): presumably converts cm -> inches and kg -> pounds; confirm
# the units of height and mass against the data set documentation.
newdata <- mutate(
  starwars,
  ht_inch = 0.394 * height,
  mass_pd = 2.205 * mass
)
# Display only the name and the two new columns.
select(newdata, name, ht_inch, mass_pd)
## # A tibble: 87 x 3
##    name               ht_inch mass_pd
##    <chr>                <dbl>   <dbl>
##  1 Luke Skywalker        67.8   170. 
##  2 C-3PO                 65.8   165. 
##  3 R2-D2                 37.8    70.6
##  4 Darth Vader           79.6   300. 
##  5 Leia Organa           59.1   108. 
##  6 Owen Lars             70.1   265. 
##  7 Beru Whitesun lars    65.0   165. 
##  8 R5-D4                 38.2    70.6
##  9 Biggs Darklighter     72.1   185. 
## 10 Obi-Wan Kenobi        71.7   170. 
## # ... with 77 more rows
# Collapse the whole table to a single row holding the mean height and
# the mean mass; missing values are ignored in both averages.
newdata <- summarise(
  starwars,
  mean_ht = mean(height, na.rm = TRUE),
  mean_mass = mean(mass, na.rm = TRUE)
)
newdata
## # A tibble: 1 x 2
##   mean_ht mean_mass
##     <dbl>     <dbl>
## 1    174.      97.3
# Mean height and mean mass for each eye colour, ignoring missing values.
# A group whose values are all missing yields NaN (see the output below).
newdata <- summarise(
  group_by(starwars, eye_color),
  mean_ht = mean(height, na.rm = TRUE),
  mean_wt = mean(mass, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 15 x 3
##    eye_color     mean_ht mean_wt
##    <chr>           <dbl>   <dbl>
##  1 black            185     76.3
##  2 blue             182.    86.5
##  3 blue-gray        182     77  
##  4 brown            166.    66.1
##  5 dark             NaN    NaN  
##  6 gold             191    NaN  
##  7 green, yellow    216    159  
##  8 hazel            174     66  
##  9 orange           180.   282. 
## 10 pink             180    NaN  
## 11 red              155.    81.4
## 12 red, blue         96    NaN  
## 13 unknown          136     31.5
## 14 white            178     48  
## 15 yellow           178.    81.1
# Mean height of female characters within each species, built up step by
# step with one verb per statement.
# 1. keep the female rows only
newdata <- filter(starwars, sex == "female")
# 2. form one group per species
newdata <- group_by(newdata, species)
# 3. average height within each group, ignoring missing values
newdata <- summarise(
  newdata,
  mean_ht = mean(height, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 7 x 2
##   species    mean_ht
##   <chr>        <dbl>
## 1 Clawdite      168 
## 2 Human         160.
## 3 Kaminoan      213 
## 4 Mirialan      168 
## 5 Tholothian    184 
## 6 Togruta       178 
## 7 Twi'lek       178
# The same computation written as a single pipeline: %>% passes the value
# on its left to the call on its right as the first argument, so no
# intermediate assignments are needed.
newdata <- starwars %>%
  filter(sex == "female") %>%
  group_by(species) %>%
  summarize(mean_ht = mean(height, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 7 x 2
##   species    mean_ht
##   <chr>        <dbl>
## 1 Clawdite      168 
## 2 Human         160.
## 3 Kaminoan      213 
## 4 Mirialan      168 
## 5 Tholothian    184 
## 6 Togruta       178 
## 7 Twi'lek       178
library(readr)

# Read the comma-delimited file into a tibble. readr guesses each column's
# type and reports the guessed specification (shown below).
# NOTE(review): this is an absolute, user-specific path; adjust it to the
# location of wide_data.csv on your machine.
wide_data <- read_csv(
  file = "C:/Users/sclee1/OneDrive/Documents/R/wide_data.csv"
)
## Parsed with column specification:
## cols(
##   id = col_double(),
##   name = col_character(),
##   sex = col_character(),
##   age = col_double(),
##   income = col_double()
## )
library(tidyr)
# Reshape from wide to long: stack the sex, age, and income columns into
# name/value pairs, giving one row per (id, name, variable) combination.
# pivot_longer() replaces the superseded gather(). The stacked columns have
# different types (character and double), so each value is converted to
# character to let them share the single `value` column
# (values_transform requires tidyr >= 1.1.0).
# Rows come out grouped by id rather than by variable.
long_data <- pivot_longer(wide_data,
                          cols = sex:income,
                          names_to = "variable",
                          values_to = "value",
                          values_transform = list(value = as.character))

long_data
## # A tibble: 9 x 4
##      id name  variable value 
##   <dbl> <chr> <chr>    <chr> 
## 1     1 Bill  sex      Male  
## 2     1 Bill  age      22    
## 3     1 Bill  income   55000 
## 4     2 Bob   sex      Male  
## 5     2 Bob   age      25    
## 6     2 Bob   income   75000 
## 7     3 Mary  sex      Female
## 8     3 Mary  age      18    
## 9     3 Mary  income   90000
# Reshape back from long to wide: one column per distinct `variable`,
# filled from `value`. pivot_wider() replaces the superseded spread().
# The new columns appear in order of first appearance (sex, age, income)
# and stay character because `value` is a character column.
wide_data <- pivot_wider(long_data,
                         names_from = variable,
                         values_from = value)
wide_data
## # A tibble: 3 x 5
##      id name  sex    age   income
##   <dbl> <chr> <chr>  <chr> <chr> 
## 1     1 Bill  Male   22    55000 
## 2     2 Bob   Male   25    75000 
## 3     3 Mary  Female 18    90000

Ch2: Introduction to ggplot2

Ch3: Univariate graphs