library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Source: Kaggle - paintings
In 2013, students of the Statistics class at FSEV UK were asked to rate how much they like each one of 39 paintings (on a scale from 1 to 5). These comprise of 13 different art movements (exactly 3 paintings for each art movement).
S1-S48: students’ ratings, where one means “don’t like at all” (integer) art movement: the art movement the painting belongs to (categorical) artist: the author of the painting (categorical) painting: the name of the painting (categorical)
# load the paintings.csv dataset as tibble
paintings <- as_tibble (
read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/UntidyDatasets/paintings.csv")
)
str(paintings)
## tibble [39 x 51] (S3: tbl_df/tbl/data.frame)
## $ S1 : int [1:39] 3 2 1 5 2 5 4 2 1 3 ...
## $ S2 : int [1:39] 2 2 3 3 2 4 5 1 2 5 ...
## $ S3 : int [1:39] 3 2 3 1 2 4 4 1 2 5 ...
## $ S4 : int [1:39] 1 4 1 1 1 1 3 2 1 5 ...
## $ S5 : int [1:39] 1 1 1 1 1 1 2 2 1 2 ...
## $ S6 : int [1:39] 1 1 1 1 1 1 2 3 1 2 ...
## $ S7 : int [1:39] 2 2 5 3 1 3 5 1 3 5 ...
## $ S8 : int [1:39] 2 3 2 3 2 5 5 3 3 4 ...
## $ S9 : int [1:39] 3 4 1 1 1 2 3 2 1 4 ...
## $ S10 : int [1:39] 1 3 2 1 1 1 2 1 1 4 ...
## $ S11 : int [1:39] 2 3 2 1 1 2 4 2 1 3 ...
## $ S12 : int [1:39] 2 2 1 2 1 2 4 2 1 4 ...
## $ S13 : int [1:39] 2 4 1 2 1 2 3 4 1 5 ...
## $ S14 : int [1:39] 2 2 1 2 2 3 5 1 2 4 ...
## $ S15 : int [1:39] 2 5 1 1 2 3 4 2 1 5 ...
## $ S16 : int [1:39] 4 3 2 3 2 3 2 3 1 5 ...
## $ S17 : int [1:39] 5 3 5 5 5 5 4 3 5 4 ...
## $ S18 : int [1:39] 5 5 4 3 5 5 3 3 5 5 ...
## $ S19 : int [1:39] 4 2 1 5 2 5 5 1 1 5 ...
## $ S20 : int [1:39] 5 4 4 5 3 4 5 2 2 5 ...
## $ S21 : int [1:39] 4 3 1 1 1 3 5 1 1 4 ...
## $ S22 : int [1:39] 3 3 2 2 2 4 4 1 1 3 ...
## $ S23 : int [1:39] 1 3 3 4 3 4 2 1 2 2 ...
## $ S24 : int [1:39] 1 1 1 4 2 5 4 2 1 5 ...
## $ S25 : int [1:39] 5 2 4 1 4 1 4 1 1 1 ...
## $ S26 : int [1:39] 4 2 4 1 2 4 3 4 1 4 ...
## $ S27 : int [1:39] 4 3 5 2 2 5 2 4 2 3 ...
## $ S28 : int [1:39] 2 4 2 1 1 2 3 2 2 2 ...
## $ S29 : int [1:39] 2 1 1 1 1 2 2 1 1 1 ...
## $ S30 : int [1:39] 1 3 1 2 1 2 2 2 1 3 ...
## $ S31 : int [1:39] 1 1 1 1 1 1 3 1 1 1 ...
## $ S32 : int [1:39] 2 3 1 2 1 4 2 1 2 3 ...
## $ S33 : int [1:39] 2 5 2 1 2 3 2 1 1 5 ...
## $ S34 : int [1:39] 5 5 3 4 2 5 5 4 1 5 ...
## $ S35 : int [1:39] 1 4 2 2 3 3 4 4 1 4 ...
## $ S36 : int [1:39] 1 4 2 2 3 3 4 4 1 4 ...
## $ S37 : int [1:39] 3 2 1 2 1 3 4 2 1 4 ...
## $ S38 : int [1:39] 3 2 1 4 1 1 1 1 1 4 ...
## $ S39 : int [1:39] 2 3 1 3 1 1 4 3 1 2 ...
## $ S40 : int [1:39] 2 4 1 3 1 3 4 2 1 3 ...
## $ S41 : int [1:39] 3 4 2 2 2 1 4 4 2 5 ...
## $ S42 : int [1:39] 2 4 1 2 1 2 3 3 1 5 ...
## $ S43 : int [1:39] 1 4 1 3 1 2 4 3 1 1 ...
## $ S44 : int [1:39] 2 1 1 1 2 2 1 1 2 3 ...
## $ S45 : int [1:39] 2 4 4 1 1 3 2 3 3 4 ...
## $ S46 : int [1:39] 4 2 3 3 4 4 5 2 3 5 ...
## $ S47 : int [1:39] 2 2 1 2 2 1 3 4 1 1 ...
## $ S48 : int [1:39] 2 1 3 5 4 5 1 1 5 5 ...
## $ art.movement: chr [1:39] "Renaissance" "Renaissance" "Renaissance" "Baroque" ...
## $ artist : chr [1:39] "Sandro Botticelli" "Leonardo da Vinci" "Raphael" "Caravaggio" ...
## $ painting : chr [1:39] "The Birth of Venus" "Lady with an Ermine" "Three Graces" "Entombment" ...
The paintings dataset has 51 columns where the 48 student rankings are each separate columns. There are two approaches to tidying this dataset.
# tidy paintings.csv dataset
# tidy option 1 combine all student rankings into a single column
paintings %>%
pivot_longer(starts_with("s"), names_to = "student", values_to = "ranking")
## # A tibble: 1,872 x 5
## art.movement artist painting student ranking
## <chr> <chr> <chr> <chr> <int>
## 1 Renaissance Sandro Botticelli The Birth of Venus S1 3
## 2 Renaissance Sandro Botticelli The Birth of Venus S2 2
## 3 Renaissance Sandro Botticelli The Birth of Venus S3 3
## 4 Renaissance Sandro Botticelli The Birth of Venus S4 1
## 5 Renaissance Sandro Botticelli The Birth of Venus S5 1
## 6 Renaissance Sandro Botticelli The Birth of Venus S6 1
## 7 Renaissance Sandro Botticelli The Birth of Venus S7 2
## 8 Renaissance Sandro Botticelli The Birth of Venus S8 2
## 9 Renaissance Sandro Botticelli The Birth of Venus S9 3
## 10 Renaissance Sandro Botticelli The Birth of Venus S10 1
## # ... with 1,862 more rows
# tidy option 2 after combining rankings, sum and average them
paintings %>%
pivot_longer(starts_with("s"), names_to = "student", values_to = "ranking") %>%
group_by(art.movement, artist, painting) %>%
summarise(score = sum(ranking), avg = mean(ranking))
## `summarise()` has grouped output by 'art.movement', 'artist'. You can override using the `.groups` argument.
## # A tibble: 39 x 5
## # Groups: art.movement, artist [39]
## art.movement artist painting score avg
## <chr> <chr> <chr> <int> <dbl>
## 1 Abstract art Kazimir Malevich Black Square 86 1.79
## 2 Abstract art Piet Mondrian Composition II in Red, Blue, and ~ 95 1.98
## 3 Abstract art Wassily Kandinsky Moscow. Red Square 171 3.56
## 4 Art Nouveau Alfonz Mucha Four Seasons 190 3.96
## 5 Art Nouveau Gustav Klimt The Kiss 141 2.94
## 6 Art Nouveau Pierre Bonnard The Letter 145 3.02
## 7 Baroque Caravaggio Entombment 111 2.31
## 8 Baroque Diego Velazquez Rokeby Venus 90 1.88
## 9 Baroque Rembrandt van Ri~ The Night Watch 140 2.92
## 10 Cubism Georges Braque Violin and Candlestick 124 2.58
## # ... with 29 more rows
Source: econstor - Time Allocation
# load time_alloc dataset as tibble
time_alloc <- as_tibble(
read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/UntidyDatasets/TimeAllocation5decades.csv")
)
str(time_alloc)
## tibble [225 x 5] (S3: tbl_df/tbl/data.frame)
## $ Entity : chr [1:225] "Core Market Work (Men)" "Core Market Work (Men)" "Core Market Work (Men)" "Core Market Work (Men)" ...
## $ Year : int [1:225] 1965 1975 1985 1993 2003 1965 1975 1985 1993 2003 ...
## $ Hours.per.week.spent.in.market.and.non.market.work..Aguiar.and.Hurst..2006..: num [1:225] 42.1 38.8 35.7 38.1 35.9 ...
## $ Hours.per.week.spent.on.child.care..Aguiar.and.Hurst..2006.. : num [1:225] NA NA NA NA NA NA NA NA NA NA ...
## $ Hours.per.week.spent.in.leisure..Aguiar.and.Hurst..2006.. : num [1:225] NA NA NA NA NA NA NA NA NA NA ...
To tidy the time_alloc dataset the Entity column will split into two categorical variables, category and type. The three “Hours.per.week…” columns have mutually exclusive values for work, leisure and child care categories. These need to be combined into a single column for hours.
# tidy time_alloc dataset
time_alloc$Entity <- str_replace(time_alloc$Entity,"\\)", "") # clear the close paren from Entity column
time_alloc <- time_alloc %>%
rename_with(tolower) %>% # rename all cols to lower case
rename(work = contains(".work"), # change long names to simple names
child_care = contains("child.care"),
leisure = contains("leisure")) %>%
mutate(hours = coalesce(work, child_care, leisure)) %>% # combine work, child care and leisure into single column
separate(entity,into = c("category","type"), sep = " \\(") %>% # split entity into two columns at the open paren
select(category, type, year, hours) # select the columns to work with
head(time_alloc)
## # A tibble: 6 x 4
## category type year hours
## <chr> <chr> <int> <dbl>
## 1 Core Market Work Men 1965 42.1
## 2 Core Market Work Men 1975 38.8
## 3 Core Market Work Men 1985 35.7
## 4 Core Market Work Men 1993 38.1
## 5 Core Market Work Men 2003 35.9
## 6 Core Market Work Total 1965 28.2
Source: Kaggle - RollerCoasterData
# load rollercoaster dataset as tibble and take a look at it
rollercoaster <- as_tibble(
read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/TidyDatasets/RollerCoaster.csv")
)
head(rollercoaster)
## # A tibble: 6 x 14
## Coaster Park City State Type Design Year_Opened Top_Speed Max_Height Drop
## <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Zippin ~ Libe~ Memp~ Tenn~ Wood~ Sit D~ 1915 40 70 70
## 2 Jack Ra~ Kenn~ West~ Penn~ Wood~ Sit D~ 1921 45 40 70
## 3 Thunder~ Dorn~ Alle~ Penn~ Wood~ Sit D~ 1923 45 80 65
## 4 Giant D~ Sant~ Sant~ Cali~ Wood~ Sit D~ 1924 55 70 65
## 5 Thunder~ Kenn~ West~ Penn~ Wood~ Sit D~ 1924 55 70 95
## 6 Wildcat Lake~ Bris~ Conn~ Wood~ Sit D~ 1927 48 85 78
## # ... with 4 more variables: Length <int>, Duration <int>, Inversions <chr>,
## # Num_of_Inversions <int>
Source: Kaggle - nyphil perf-history
# load nyphil dataset as tibble and take look at it
nyphil <- as_tibble(
read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/TidyDatasets/NYPhilconcerts.csv")
)
head(nyphil)
## # A tibble: 6 x 9
## Date Location Time Venue eventType season programID orchestra id
## <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr> <chr>
## 1 1842-1~ Manhatta~ 8:00PM Apoll~ Subscrip~ 1842-~ 3853 New York P~ 38e072~
## 2 1843-0~ Manhatta~ 8:00PM Apoll~ Subscrip~ 1842-~ 5178 New York P~ c7b2b9~
## 3 1843-0~ Manhatta~ 8:00PM Apoll~ Special 1842-~ 10785 Musicians ~ 894e1a~
## 4 1843-0~ Manhatta~ 8:00PM Apoll~ Subscrip~ 1842-~ 5887 New York P~ 34ec2c~
## 5 1843-1~ Manhatta~ None Apoll~ Subscrip~ 1843-~ 305 New York P~ 610a4a~
## 6 1844-0~ Manhatta~ 8:00PM Apoll~ Subscrip~ 1843-~ 3368 New York P~ 47b6e9~