# 1.2 Cleaning data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the starwars example data set into the workspace.
data("starwars")

# Column subset: retain only name, height, and gender.
newdata <- select(starwars, c(name, height, gender))
newdata
## # A tibble: 87 x 3
## name height gender
## <chr> <int> <chr>
## 1 Luke Skywalker 172 masculine
## 2 C-3PO 167 masculine
## 3 R2-D2 96 masculine
## 4 Darth Vader 202 masculine
## 5 Leia Organa 150 feminine
## 6 Owen Lars 178 masculine
## 7 Beru Whitesun lars 165 feminine
## 8 R5-D4 97 masculine
## 9 Biggs Darklighter 183 masculine
## 10 Obi-Wan Kenobi 182 masculine
## # ... with 77 more rows
# Column subset with a range: name, plus every column from mass
# through species (both endpoints included).
newdata <- select(starwars, c(name, mass:species))
newdata
## # A tibble: 87 x 10
## name mass hair_color skin_color eye_color birth_year sex gender homeworld
## <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke~ 77 blond fair blue 19 male mascu~ Tatooine
## 2 C-3PO 75 <NA> gold yellow 112 none mascu~ Tatooine
## 3 R2-D2 32 <NA> white, bl~ red 33 none mascu~ Naboo
## 4 Dart~ 136 none white yellow 41.9 male mascu~ Tatooine
## 5 Leia~ 49 brown light brown 19 fema~ femin~ Alderaan
## 6 Owen~ 120 brown, gr~ light blue 52 male mascu~ Tatooine
## 7 Beru~ 75 brown light blue 47 fema~ femin~ Tatooine
## 8 R5-D4 32 <NA> white, red red NA none mascu~ Tatooine
## 9 Bigg~ 84 black light brown 24 male mascu~ Tatooine
## 10 Obi-~ 77 auburn, w~ fair blue-gray 57 male mascu~ Stewjon
## # ... with 77 more rows, and 1 more variable: species <chr>
# Row subset: keep characters whose sex is anything other than "male".
# Rows with a missing sex are dropped as well, because NA != "male"
# evaluates to NA and filter() keeps only rows where the test is TRUE.
newdata <- filter(.data = starwars, sex != "male")
newdata
## # A tibble: 23 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 2 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 3 Leia~ 150 49 brown light brown 19 fema~ femin~
## 4 Beru~ 165 75 brown light blue 47 fema~ femin~
## 5 R5-D4 97 32 <NA> white, red red NA none mascu~
## 6 Jabb~ 175 1358 <NA> green-tan~ orange 600 herm~ mascu~
## 7 IG-88 200 140 none metal red 15 none mascu~
## 8 Mon ~ 150 NA auburn fair blue 48 fema~ femin~
## 9 Shmi~ 163 NA black fair brown 72 fema~ femin~
## 10 Ayla~ 178 55 none blue hazel 48 fema~ femin~
## # ... with 13 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# Row subset: drop characters whose homeworld is Alderaan, Coruscant, or
# Endor. %in% never returns NA, so rows with a missing homeworld are kept.
newdata <- filter(
  starwars,
  !(homeworld %in% c("Alderaan", "Coruscant", "Endor"))
)
newdata
## # A tibble: 80 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 202 136 none white yellow 41.9 male mascu~
## 5 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 6 Beru~ 165 75 brown light blue 47 fema~ femin~
## 7 R5-D4 97 32 <NA> white, red red NA none mascu~
## 8 Bigg~ 183 84 black light brown 24 male mascu~
## 9 Obi-~ 182 77 auburn, w~ fair blue-gray 57 male mascu~
## 10 Anak~ 188 84 blond fair blue 41.9 male mascu~
## # ... with 70 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# Add two derived columns: height scaled by 0.394 and mass scaled by 2.205.
# NOTE(review): the multipliers look like cm -> inch and kg -> pound
# factors, i.e. height in cm and mass in kg -- confirm against ?starwars.
newdata <- mutate(
  starwars,
  ht_inch = 0.394 * height,
  mass_pd = 2.205 * mass
)
# Display only the name next to the two derived columns.
select(newdata, name, ht_inch, mass_pd)
## # A tibble: 87 x 3
## name ht_inch mass_pd
## <chr> <dbl> <dbl>
## 1 Luke Skywalker 67.8 170.
## 2 C-3PO 65.8 165.
## 3 R2-D2 37.8 70.6
## 4 Darth Vader 79.6 300.
## 5 Leia Organa 59.1 108.
## 6 Owen Lars 70.1 265.
## 7 Beru Whitesun lars 65.0 165.
## 8 R5-D4 38.2 70.6
## 9 Biggs Darklighter 72.1 185.
## 10 Obi-Wan Kenobi 71.7 170.
## # ... with 77 more rows
# Collapse the whole table to a single row holding the mean height and the
# mean mass; na.rm = TRUE leaves missing values out of each mean.
newdata <- summarise(
  starwars,
  mean_ht = mean(height, na.rm = TRUE),
  mean_mass = mean(mass, na.rm = TRUE)
)
newdata
## # A tibble: 1 x 2
## mean_ht mean_mass
## <dbl> <dbl>
## 1 174. 97.3
# Mean height and mean mass for each eye colour. A group whose values are
# all missing yields NaN, since na.rm = TRUE leaves nothing to average.
newdata <- summarise(
  group_by(starwars, eye_color),
  mean_ht = mean(height, na.rm = TRUE),
  mean_wt = mean(mass, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 15 x 3
## eye_color mean_ht mean_wt
## <chr> <dbl> <dbl>
## 1 black 185 76.3
## 2 blue 182. 86.5
## 3 blue-gray 182 77
## 4 brown 166. 66.1
## 5 dark NaN NaN
## 6 gold 191 NaN
## 7 green, yellow 216 159
## 8 hazel 174 66
## 9 orange 180. 282.
## 10 pink 180 NaN
## 11 red 155. 81.4
## 12 red, blue 96 NaN
## 13 unknown 136 31.5
## 14 white 178 48
## 15 yellow 178. 81.1
# Mean height of female characters, computed separately for each species.
# Step 1: keep only the rows where sex is "female".
newdata <- filter(starwars, sex == "female")
# Step 2: group the remaining rows by species.
newdata <- group_by(newdata, species)
# Step 3: reduce each group to its mean height, ignoring missing values.
newdata <- summarise(newdata, mean_ht = mean(height, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 7 x 2
## species mean_ht
## <chr> <dbl>
## 1 Clawdite 168
## 2 Human 160.
## 3 Kaminoan 213
## 4 Mirialan 168
## 5 Tholothian 184
## 6 Togruta 178
## 7 Twi'lek 178
# The same computation written as one pipeline: %>% hands the result on its
# left to the call on its right as that call's first argument.
newdata <- starwars %>%
  filter(sex == "female") %>%
  group_by(species) %>%
  summarise(mean_ht = mean(height, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 7 x 2
## species mean_ht
## <chr> <dbl>
## 1 Clawdite 168
## 2 Human 160.
## 3 Kaminoan 213
## 4 Mirialan 168
## 5 Tholothian 184
## 6 Togruta 178
## 7 Twi'lek 178
library(readr)
# Read the comma-delimited file into wide_data.
# NOTE(review): the absolute path below is tied to one user's Windows
# profile -- adjust it before running this on another machine.
wide_data <- read_csv(
  file = "C:/Users/sclee1/OneDrive/Documents/R/wide_data.csv"
)
## Parsed with column specification:
## cols(
## id = col_double(),
## name = col_character(),
## sex = col_character(),
## age = col_double(),
## income = col_double()
## )
library(tidyr)
# Reshape from wide to long: every cell in the columns sex through income
# becomes its own row, with the source column name stored in `variable` and
# the cell content stored in `value`.
# pivot_longer() is the current replacement for the superseded gather().
# The stacked columns mix character (sex) and numeric (age, income) data,
# which pivot_longer() will not combine on its own, so values_transform
# converts each column to character first (requires tidyr >= 1.1.0).
# Rows come out grouped by original row (id) rather than by variable.
long_data <- pivot_longer(
  wide_data,
  cols = sex:income,
  names_to = "variable",
  values_to = "value",
  values_transform = list(value = as.character)
)
long_data
## # A tibble: 9 x 4
## id name variable value
## <dbl> <chr> <chr> <chr>
## 1 1 Bill sex Male
## 2 1 Bill age 22
## 3 1 Bill income 55000
## 4 2 Bob sex Male
## 5 2 Bob age 25
## 6 2 Bob income 75000
## 7 3 Mary sex Female
## 8 3 Mary age 18
## 9 3 Mary income 90000
# Reshape back from long to wide: each distinct entry in `variable` becomes
# a column filled from `value`. pivot_wider() is the current replacement for
# the superseded spread() and keeps the new columns in order of first
# appearance (sex, age, income) instead of sorting them alphabetically.
wide_data <- pivot_wider(
  long_data,
  names_from = variable,
  values_from = value
)
# `value` is a character column, so age and income come back as text;
# convert them to numbers so the round trip restores the original types.
wide_data <- mutate(
  wide_data,
  age = as.numeric(age),
  income = as.numeric(income)
)
wide_data
## # A tibble: 3 x 5
## id name sex age income
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 Bill Male 22 55000
## 2 2 Bob Male 25 75000
## 3 3 Mary Female 18 90000