1.2 Cleaning Data
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(starwars)
# keep the variables name, height, and gender
newdata <- select(starwars, name, height, gender)
library(dplyr)
data(starwars)
# keep the variables name and all variables
# between mass and species inclusive
newdata <- select(starwars, name, mass:species)
newdata
## # A tibble: 87 x 10
## name mass hair_color skin_color eye_color birth_year sex gender homeworld
## <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke~ 77 blond fair blue 19 male mascu~ Tatooine
## 2 C-3PO 75 <NA> gold yellow 112 none mascu~ Tatooine
## 3 R2-D2 32 <NA> white, bl~ red 33 none mascu~ Naboo
## 4 Dart~ 136 none white yellow 41.9 male mascu~ Tatooine
## 5 Leia~ 49 brown light brown 19 fema~ femin~ Alderaan
## 6 Owen~ 120 brown, gr~ light blue 52 male mascu~ Tatooine
## 7 Beru~ 75 brown light blue 47 fema~ femin~ Tatooine
## 8 R5-D4 32 <NA> white, red red NA none mascu~ Tatooine
## 9 Bigg~ 84 black light brown 24 male mascu~ Tatooine
## 10 Obi-~ 77 auburn, w~ fair blue-gray 57 male mascu~ Stewjon
## # ... with 77 more rows, and 1 more variable: species <chr>
# keep all variables except birth_year and gender
newdata <- select(starwars, -birth_year, -gender)
newdata
## # A tibble: 87 x 12
## name height mass hair_color skin_color eye_color sex homeworld species
## <chr> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue male Tatooine Human
## 2 C-3PO 167 75 <NA> gold yellow none Tatooine Droid
## 3 R2-D2 96 32 <NA> white, bl~ red none Naboo Droid
## 4 Dart~ 202 136 none white yellow male Tatooine Human
## 5 Leia~ 150 49 brown light brown fema~ Alderaan Human
## 6 Owen~ 178 120 brown, gr~ light blue male Tatooine Human
## 7 Beru~ 165 75 brown light blue fema~ Tatooine Human
## 8 R5-D4 97 32 <NA> white, red red none Tatooine Droid
## 9 Bigg~ 183 84 black light brown male Tatooine Human
## 10 Obi-~ 182 77 auburn, w~ fair blue-gray male Stewjon Human
## # ... with 77 more rows, and 3 more variables: films <list>, vehicles <list>,
## # starships <list>
# select females
newdata <- filter(starwars,
sex != "female")
newdata
## # A tibble: 67 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 202 136 none white yellow 41.9 male mascu~
## 5 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 6 R5-D4 97 32 <NA> white, red red NA none mascu~
## 7 Bigg~ 183 84 black light brown 24 male mascu~
## 8 Obi-~ 182 77 auburn, w~ fair blue-gray 57 male mascu~
## 9 Anak~ 188 84 blond fair blue 41.9 male mascu~
## 10 Wilh~ 180 NA auburn, g~ fair blue 64 male mascu~
## # ... with 57 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# this can be written more succinctly as
newdata <- filter(starwars,
!homeworld %in% c("Alderaan", "Coruscant", "Endor"))
newdata
## # A tibble: 80 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 202 136 none white yellow 41.9 male mascu~
## 5 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 6 Beru~ 165 75 brown light blue 47 fema~ femin~
## 7 R5-D4 97 32 <NA> white, red red NA none mascu~
## 8 Bigg~ 183 84 black light brown 24 male mascu~
## 9 Obi-~ 182 77 auburn, w~ fair blue-gray 57 male mascu~
## 10 Anak~ 188 84 blond fair blue 41.9 male mascu~
## # ... with 70 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# convert height in centimeters to inches,
# and mass in kilograms to pounds
newdata <- mutate(starwars,
height = height * 0.394,
mass = mass * 2.205)
newdata
## # A tibble: 87 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 67.8 170. blond fair blue 19 male mascu~
## 2 C-3PO 65.8 165. <NA> gold yellow 112 none mascu~
## 3 R2-D2 37.8 70.6 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 79.6 300. none white yellow 41.9 male mascu~
## 5 Leia~ 59.1 108. brown light brown 19 fema~ femin~
## 6 Owen~ 70.1 265. brown, gr~ light blue 52 male mascu~
## 7 Beru~ 65.0 165. brown light blue 47 fema~ femin~
## 8 R5-D4 38.2 70.6 <NA> white, red red NA none mascu~
## 9 Bigg~ 72.1 185. black light brown 24 male mascu~
## 10 Obi-~ 71.7 170. auburn, w~ fair blue-gray 57 male mascu~
## # ... with 77 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
library(dplyr)
# if height is greater than 180
# then heightcat = "tall",
# otherwise heightcat = "short"
newdata <- mutate(starwars,
heightcat = ifelse(height > 180,
"tall",
"short"))
newdata
## # A tibble: 87 x 15
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 202 136 none white yellow 41.9 male mascu~
## 5 Leia~ 150 49 brown light brown 19 fema~ femin~
## 6 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 7 Beru~ 165 75 brown light blue 47 fema~ femin~
## 8 R5-D4 97 32 <NA> white, red red NA none mascu~
## 9 Bigg~ 183 84 black light brown 24 male mascu~
## 10 Obi-~ 182 77 auburn, w~ fair blue-gray 57 male mascu~
## # ... with 77 more rows, and 6 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>, heightcat <chr>
# convert any eye color that is not
# black, blue or brown, to other
newdata <- mutate(starwars,
eye_color = ifelse(eye_color %in% c("black", "blue", "brown"),
eye_color,
"other"))
newdata
## # A tibble: 87 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold other 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ other 33 none mascu~
## 4 Dart~ 202 136 none white other 41.9 male mascu~
## 5 Leia~ 150 49 brown light brown 19 fema~ femin~
## 6 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 7 Beru~ 165 75 brown light blue 47 fema~ femin~
## 8 R5-D4 97 32 <NA> white, red other NA none mascu~
## 9 Bigg~ 183 84 black light brown 24 male mascu~
## 10 Obi-~ 182 77 auburn, w~ fair other 57 male mascu~
## # ... with 77 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# set heights greater than 200 or
# less than 75 to missing
newdata <- mutate(starwars,
height = ifelse(height < 75 | height > 200,
NA,
height))
newdata
## # A tibble: 87 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 172 77 blond fair blue 19 male mascu~
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu~
## 3 R2-D2 96 32 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ NA 136 none white yellow 41.9 male mascu~
## 5 Leia~ 150 49 brown light brown 19 fema~ femin~
## 6 Owen~ 178 120 brown, gr~ light blue 52 male mascu~
## 7 Beru~ 165 75 brown light blue 47 fema~ femin~
## 8 R5-D4 97 32 <NA> white, red red NA none mascu~
## 9 Bigg~ 183 84 black light brown 24 male mascu~
## 10 Obi-~ 182 77 auburn, w~ fair blue-gray 57 male mascu~
## # ... with 77 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
newdata <- mutate(starwars,
height = height * 0.394,
mass = mass * 2.205)
select(newdata, height, mass)
## # A tibble: 87 x 2
## height mass
## <dbl> <dbl>
## 1 67.8 170.
## 2 65.8 165.
## 3 37.8 70.6
## 4 79.6 300.
## 5 59.1 108.
## 6 70.1 265.
## 7 65.0 165.
## 8 38.2 70.6
## 9 72.1 185.
## 10 71.7 170.
## # ... with 77 more rows
newdata
## # A tibble: 87 x 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke~ 67.8 170. blond fair blue 19 male mascu~
## 2 C-3PO 65.8 165. <NA> gold yellow 112 none mascu~
## 3 R2-D2 37.8 70.6 <NA> white, bl~ red 33 none mascu~
## 4 Dart~ 79.6 300. none white yellow 41.9 male mascu~
## 5 Leia~ 59.1 108. brown light brown 19 fema~ femin~
## 6 Owen~ 70.1 265. brown, gr~ light blue 52 male mascu~
## 7 Beru~ 65.0 165. brown light blue 47 fema~ femin~
## 8 R5-D4 38.2 70.6 <NA> white, red red NA none mascu~
## 9 Bigg~ 72.1 185. black light brown 24 male mascu~
## 10 Obi-~ 71.7 170. auburn, w~ fair blue-gray 57 male mascu~
## # ... with 77 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## # films <list>, vehicles <list>, starships <list>
# calculate mean height and mass
newdata <- summarize(starwars,
mean_ht = mean(height, na.rm=TRUE),
mean_mass = mean(mass, na.rm=TRUE))
newdata
## # A tibble: 1 x 2
## mean_ht mean_mass
## <dbl> <dbl>
## 1 174. 97.3
# calculate mean height and weight by gender
newdata <- group_by(starwars, sex)
newdata <- summarize(newdata,
mean_ht = mean(height, na.rm=TRUE),
mean_wt = mean(mass, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 5 x 3
## sex mean_ht mean_wt
## <chr> <dbl> <dbl>
## 1 female 169. 54.7
## 2 hermaphroditic 175 1358
## 3 male 179. 81.0
## 4 none 131. 69.8
## 5 <NA> 181. 48
newdata <- starwars %>%
filter(sex == "female") %>%
group_by(species) %>%
summarize(mean_ht = mean(height, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
newdata
## # A tibble: 7 x 2
## species mean_ht
## <chr> <dbl>
## 1 Clawdite 168
## 2 Human 160.
## 3 Kaminoan 213
## 4 Mirialan 168
## 5 Tholothian 184
## 6 Togruta 178
## 7 Twi'lek 178
library(readr)
## Warning: package 'readr' was built under R version 4.0.3
# import data from a comma delimited file
wide_data <- read_csv("wide_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## id = col_double(),
## name = col_character(),
## sex = col_character(),
## age = col_double(),
## income = col_double()
## )
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.3
long_data <- gather(wide_data,
key="variable",
value="value",
sex:income)
long_data
## # A tibble: 9 x 4
## id name variable value
## <dbl> <chr> <chr> <chr>
## 1 1 Bill sex Male
## 2 2 Bob sex Male
## 3 3 Mary sex Female
## 4 1 Bill age 22
## 5 2 Bob age 25
## 6 3 Mary age 18
## 7 1 Bill income 55000
## 8 2 Bob income 75000
## 9 3 Mary income 90000
library(tidyr)
wide_data <- spread(long_data, variable, value)
wide_data
## # A tibble: 3 x 5
## id name age income sex
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 Bill 22 55000 Male
## 2 2 Bob 25 75000 Male
## 3 3 Mary 18 90000 Female