Data exploration with the dplyr, tidyr and stringr libraries. - Column subsetting - Filtering of rows - Boolean operators, Boolean algebra, de Morgan’s laws - Creating new columns (1x Challenge) - Missing values - Manipulating text (3x Challenge) - Aggregating data (1x Challenge) - Pivot tables, data in long and wide format - Merging tables
Useful resources:
- dplyr
cheatsheet
- tidyr
cheatsheet
- stringr
cheatsheet
- ggplot2
cheatsheet
- A.
Kassambara - Guide to Create Beautiful Graphics in R.
The data comes from https://flixgem.com/ (dataset version as of March 12, 2021). The data contains information on 9425 movies and series available on Netlix.
dane #takes the dane data frame
select(Title, Runtime, IMDb.Score, Release.Date) %>% #to choose only the columns 'Title,' 'Runtime,' 'IMDb.Score,' and 'Release.Date.'
head(5) #to display the first 5 rows of the resulting data frame
dane %>% #takes the dane data frame
select(-Netflix.Link, -IMDb.Link, -Image, -Poster, -TMDb.Trailer)%>% #to choose only the columns'Netflix.Link,' 'IMDb.Link,' 'Image,' 'Poster,' and 'TMDb.Trailer'
head(5) #to display the first 5 rows of the resulting data frame
dane %>% #takes the dane data frame
select(1:10)%>% #to choose only the columns from the 1st to the 10th column
head(5) #to display the first 5 rows of the resulting data frame
dane %>% #takes the dane data frame
select(Title:Runtime)%>% #to choose only the columns from 'Title' to 'Runtime'
head(5) #to display the first 5 rows of the resulting data frame
dane %>% #takes the dane data frame
select(starts_with('IMDb'))%>% #to select columns that start with the string 'IMDb'
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
select(ends_with('Score'))%>% #to select columns that end with the 'Score'
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
select(contains('Date'))%>% #to select columns that contain the word 'Date'
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
select(matches('^[a-z]{5,6}$')) %>% #to select columns whose names consist of 5 to 6 lowercase letters
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
select(-matches('\\.'))%>% #to select columns which do not contain a period ('.') in their names
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
select(IMDb.Score)%>% #to select the 'IMDb.Score' column
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
pull(IMDb.Score)%>% #to extract the 'IMDb.Score' column
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
pull(IMDb.Score, Title)%>% #to extract the 'IMDb.Score' and 'Title' columns
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(Series.or.Movie == "Series")%>% #to filter the dane data frame to select only rows where the 'Series.or.Movie' column has the value "Series."
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(IMDb.Score > 8)%>% #to select only rows where the 'IMDb.Score' column has a value greater than 8
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(IMDb.Score >= 8 & Series.or.Movie == 'Series')%>% #to select only trows where the 'IMDb.Score' is greater than or equal to 8 and where the 'Series.or.Movie' is equal to 'Series', it retrieves rows with a high IMDb score that are specifically categorized as 'Series'.
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(IMDb.Score >= 9 | IMDb.Votes < 1000)%>% #selects rows that meet either of the two conditions: rows where the 'IMDb.Score' is greater than or equal to 9, rows where the 'IMDb.Votes' is less than 1000.
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(!(IMDb.Score >= 9 | IMDb.Votes < 1000))%>% #selects rows that do not meet either of the following conditions:rows where the 'IMDb.Score' is greater than or equal to 9, rows where the 'IMDb.Votes' is less than 1000
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
filter(!(IMDb.Score >= 9) & !(IMDb.Votes < 1000))%>% #selects rows that do not meet both of the following conditions: eows where the 'IMDb.Score' is greater than or equal to 9, rows where the 'IMDb.Votes' is less than 1000
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
mutate(score_category = if_else(IMDb.Score >= 5, 'Good', 'Poor')) %>% #to add a new column called 'score_category' to the data frame. The values in this column are assigned based on a condition: If the 'IMDb.Score' is greater than or equal to 5, it's labeled as 'Good'; otherwise, it's labeled as 'Poor'.
select(Title, IMDb.Score, score_category)%>% #to choose and display only the 'Title,' 'IMDb.Score,' and 'score_category' columns from the data frame.
head(10) #displays the first 10 rows of the resulting data frame
dane %>% #takes the dane data frame
transmute( #to create new variables
Release = Release.Date %>% as.Date(format = '%m/%d/%y') #to take the 'Release.Date' column and converts it into a specified date format.
,Netflix.Release = Netflix.Release.Date %>% as.Date(format = '%m/%d/%y') #to take the 'Netflix.Release'column and converts it into a specified date format.
)
CHALLENGE 1: What is the oldest Woody Allen film available on Netflix?
# your code goes here
dane %>%
filter(Director == "Woody Allen") %>%
mutate(Release = Release.Date %>% as.Date(format = '%m/%d/%Y')) %>%
mutate(Old = min_rank(Release)) %>%
filter(Old == 1) %>%
select(Director, Release, Old, Title)
## Director Release Old
## 1 Woody Allen 1972-08-06 1
## Title
## 1 Everything You Always Wanted to Know About Sex But Were Afraid to Ask
dane %>% #takes the dane data frame
mutate(score_category = case_when(
IMDb.Score <= 2 ~ 'Very Poor'
,IMDb.Score <= 4 ~ 'Poor'
,IMDb.Score <= 6 ~ 'Medium'
,IMDb.Score <= 8 ~ 'Good'
,IMDb.Score <= 10 ~ 'Very Good'
)) %>% #categorizes 'IMDb.Score' into different categories 'Very Poor,' 'Poor,' 'Medium,' 'Good,' and 'Very Good' based on the score ranges
select(Title, IMDb.Score, score_category)%>%
head(10) #selects and displays the 'Title,' 'IMDb.Score,' and 'score_category' columns for the first 10 rows of the modified data frame
dane %>% #takes the dane data frame
mutate(avg_score = mean(c(IMDb.Score * 10
,Hidden.Gem.Score * 10
,Rotten.Tomatoes.Score
,Metacritic.Score)
,na.rm = TRUE) %>% #calculates the average score from a combination of different score columns
round(2)) %>% #The resulting average score is then rounded to two decimal places
select(Title, avg_score)%>%
head(10) #selects and displays the 'Title' and 'avg_score' columns for the first 10 rows of the modified data frame
dane %>% #takes the dane data frame
rowwise() %>% #ensures that the mean() function calculates the average for each row rather than across all rows
mutate(avg_score = mean(c(IMDb.Score * 10
,Hidden.Gem.Score * 10
,Rotten.Tomatoes.Score
,Metacritic.Score)
,na.rm = TRUE) %>% #calculates the average score from a combination of different score columns
round(2)) %>% #The resulting average score is then rounded to two decimal places
select(Title, avg_score)%>%
head(10) #selects and displays the 'Title' and 'avg_score' columns for the first 10 rows of the modified data frame
dane %>% #takes the dane data frame
mutate(Popularity = if_else(IMDb.Votes > quantile(IMDb.Votes, 0.90, na.rm = TRUE), 'High', 'Not High')) %>% #to add a new 'Popularity' column based on the condition that checks if 'IMDb.Votes' is greater than the 90th percentile of 'IMDb.Votes'
relocate(Popularity, .after = Title) #to move the 'Popularity' column immediately after the 'Title' column in the data frame
dane %>% #takes the dane data frame
rename(
Tytul = Title
,Gatunek = Genre
) #to rename the 'Title' column to 'Tytul' and the 'Genre' column to 'Gatunek' in the data frame
dane %>% #takes the dane data frame
sapply(function(x) is.na(x) %>% sum()) #returns a count of missing values for each column in the data frame
dane %>% #takes the dane data frame
drop_na(Hidden.Gem.Score) #to remove rows where the 'Hidden.Gem.Score' column has missing values (NAs). It keeps only the rows with non-missing values in the specified column.
dane %>% #takes the dane data frame
mutate(Hidden.Gem.Score = replace_na(Hidden.Gem.Score, median(Hidden.Gem.Score, na.rm = TRUE))) %>% #to replace missing values in the 'Hidden.Gem.Score' column with the median of that column
sapply(function(x) is.na(x) %>% sum()) #to count the number of missing values in each column
dane %>% #takes the dane data frame
replace_na(list(Hidden.Gem.Score = median(dane$Hidden.Gem.Score, na.rm = TRUE))) %>% #to replace missing values in the 'Hidden.Gem.Score' column with the median of the 'Hidden.Gem.Score' column from the entire dane data frame. The na.rm = TRUE argument ensures that the median is calculated without considering missing values.
sapply(function(x) is.na(x) %>% sum()) #to count the number of missing values in each column after this replacement
gatunki = dane$Genre %>%
paste0(collapse = ', ') %>% #takes the 'Genre' column from the data frame and combines all genre values into a single comma-separated string.
str_extract_all('[A-Za-z]+') %>% #to extract words (probably genres) from the concatenated string
unlist() %>% #to convert the list of extracted genres into a vector
table() %>% #to create a frequency table of genre counts
as.data.frame() #to convert the table to a data frame
gatunki %>%
arrange(-Freq) #arranges the data frame 'gatunki' in order based on the frequency ('Freq') column. This provides a list of genres sorted by their popularity, with the most frequent genres listed first.
dane %>% #takes the dane data frame
mutate(poland_available = str_detect(Country.Availability, 'Poland')) %>% #to create a new column 'poland_available' that checks if the 'Country.Availability' column contains the string 'Poland'. If it does, it sets the value to TRUE.
filter(poland_available == TRUE) %>% #to select rows where 'poland_available' is TRUE, meaning the movie is available in Poland
pull(Title)%>% #extracts the 'Title' column from the filtered data frame
head(10) #to display the first 10 movie titles that are available in Poland based on the 'Country.Availability' column
dane %>% #takes the dane data frame
unite(
col = 'Scores'
,c('Hidden.Gem.Score', 'IMDb.Score', 'Rotten.Tomatoes.Score', 'Metacritic.Score') #to combine the values from the columns 'Hidden.Gem.Score,' 'IMDb.Score,' 'Rotten.Tomatoes.Score,' and 'Metacritic.Score' into a single new column called 'Scores'.
,sep = ', '
) %>% #to specify that the values from these columns should be separated by a comma and a space (', ')
select(Title, Scores)%>% #to choose and display only the 'Title' and 'Scores' columns from the modified data frame
head(10) #to display the first 10 rows of the modified data frame
CHALLENGE 2: What are the three highest rated comedies available in Polish?
# your code goes here
dane %>%
filter(Genre =="Comedy" & str_detect(Country.Availability, "Poland")) %>%
mutate(Ranking = min_rank(-IMDb.Score)) %>%
filter(Ranking <= 3) %>%
arrange(Ranking) %>%
select(Ranking, Title, IMDb.Score, Country.Availability)
## Ranking Title IMDb.Score
## 1 1 No Longer kids 9.0
## 2 2 Innocent 8.9
## 3 3 Aunty Donnas Big Ol House of Fun 8.8
## 4 3 Monty Pythons Flying Circus 8.8
## 5 3 Dave Chappelle 8.8
## 6 3 Still Game 8.8
## Country.Availability
## 1 United Kingdom,Russia,Lithuania,Canada,Czech Republic,Iceland,South Africa,India,Australia,Portugal,Hungary,Switzerland,Japan,Mexico,Belgium,Germany,Sweden,Greece,Hong Kong,Singapore,Thailand,Spain,France,Argentina,United States,Malaysia,Brazil,Netherlands,Israel,Italy,Poland,Turkey,Colombia
## 2 Romania,Belgium,Portugal,France,Australia,Japan,Singapore,Netherlands,Russia,Poland,Sweden,India,Slovakia,Germany,Lithuania,Czech Republic,Brazil,Israel,Spain,United Kingdom,Canada,Switzerland,Hong Kong,Argentina,United States,South Africa,Mexico,Greece,South Korea,Iceland,Italy,Thailand,Hungary,Turkey,Malaysia,Colombia
## 3 Lithuania,Australia,United Kingdom,Poland,Brazil,United States,India,Russia,Netherlands,Germany,Hong Kong,Japan,South Korea,Mexico,South Africa,France,Iceland,Thailand,Spain,Singapore,Greece,Argentina,Czech Republic,Israel,Portugal,Switzerland,Hungary,Slovakia,Canada,Italy,Sweden,Belgium,Turkey,Malaysia,Colombia,Romania
## 4 Hong Kong,Australia,Singapore,India,Netherlands,Sweden,Russia,Czech Republic,Slovakia,Israel,Lithuania,Argentina,Brazil,United Kingdom,Canada,Germany,France,Spain,Poland,Japan,Romania,Mexico,South Korea,Belgium,Greece,United States,Switzerland,Portugal,South Africa,Iceland,Italy,Thailand,Hungary,Turkey,Malaysia,Colombia
## 5 Russia,Lithuania,Germany,France,Iceland,Netherlands,India,Spain,Sweden,Belgium,Switzerland,Australia,United Kingdom,Brazil,Argentina,Mexico,Canada,United States,Israel,Hong Kong,Singapore,Portugal,Poland,Romania,Greece,South Africa,Czech Republic,Slovakia,Japan,Italy,Thailand,Hungary,South Korea,Turkey,Malaysia,Colombia
## 6 United Kingdom,Australia,Canada,United States,Lithuania,Czech Republic,Romania,Greece,India,South Africa,Singapore,Iceland,Slovakia,Thailand,Hungary,Russia,Japan,Poland,Hong Kong,Spain,Mexico,Switzerland,Belgium,Portugal,France,Argentina,Germany,Sweden,Turkey,Malaysia,Brazil,Italy,Israel,Netherlands,Colombia
CHALLENGE 3: For 2019 and 2020 productions, what is the average time between release and appearance on Netflix?
# your code goes here
dane %>%
mutate(Release = Release.Date %>% as.Date(format = '%m/%d/%Y')
,Netflix.Release = Netflix.Release.Date %>% as.Date(format = '%m/%d/%Y')) %>%
filter(year(Release) %in% c(2019,2020)) %>%
mutate(DIFF = Netflix.Release - Release) %>%
select(Title, Release, Netflix.Release, DIFF) %>%
summarise(avg = mean(DIFF))
## avg
## 1 107.0268 days
CHALLENGE 4: What are the most popular tags for productions available in Polish?
# your code goes here
dane %>%
filter(str_detect(Country.Availability, "Poland")) %>%
select(Tags) %>%
separate_rows(Tags, sep = ",") %>%
count(Tags) %>%
mutate(popular = min_rank(-n), ) %>%
arrange(popular)
## # A tibble: 769 × 3
## Tags n popular
## <chr> <int> <int>
## 1 Dramas 813 1
## 2 Comedies 731 2
## 3 TV Dramas 450 3
## 4 TV Programmes 402 4
## 5 Documentaries 394 5
## 6 US Movies 362 6
## 7 Action & Adventure 338 7
## 8 International Dramas 303 8
## 9 TV Comedies 275 9
## 10 US TV Shows 230 10
## # ℹ 759 more rows
dane %>% #takes the dane data frame
group_by(Series.or.Movie) %>% #to group the data by the 'Series.or.Movie' column
summarize( #to calculate various summary statistics
count = n() #Counts the number of observations
,avg_imdb_score = mean(IMDb.Score, na.rm = TRUE) %>% round(2) #Calculates the average 'IMDb.Score' within each group while rounding to two decimal places
,avg_imdb_votes = mean(IMDb.Votes, na.rm = TRUE) %>% round(0) #Calculates the average 'IMDb.Votes' within each group and rounds it to the nearest whole number
,sum_awards = sum(Awards.Received, na.rm = TRUE) #Sums the 'Awards.Received' within each group
)
dane %>% #takes the dane data frame
group_by(Series.or.Movie, Runtime) %>% #to group by two variables: 'Series.or.Movie' and 'Runtime'
summarize(n = n()) %>% #calculates the number of observations (count) in each group and assigns it to the variable 'n'
arrange(-n) #to sort in order based on the count ('n')
CHALLENGE 5: What are the average ratings of films produced in each decade (i.e., 1960s, 1970s, 1980s, 1990s, etc.)?
# your code goes here
dane %>%
mutate(Release = Release.Date %>% as.Date(format = '%m/%d/%Y')
,Netflix.Release = Netflix.Release.Date %>% as.Date(format = '%m/%d/%Y')) %>%
mutate(Decade = 10 * (year(Release) %/% 10)) %>%
select(Title, Release, Decade, IMDb.Score) %>%
drop_na(Decade, IMDb.Score) %>%
group_by(Decade) %>%
summarise(avg = mean(IMDb.Score))
## # A tibble: 10 × 2
## Decade avg
## <dbl> <dbl>
## 1 1930 7.46
## 2 1940 7.43
## 3 1950 7.37
## 4 1960 7.46
## 5 1970 7.33
## 6 1980 7.11
## 7 1990 6.88
## 8 2000 6.85
## 9 2010 6.94
## 10 2020 7.04
dane_pivot = dane %>%
select(Title, ends_with('Score')) #selects the 'Title' column and all columns that end with 'Score' from the dane data frame, creating a new data frame named 'dane_pivot' with these selected columns
dane_pivot = dane_pivot %>%
pivot_longer( #reshapes the data from a wide format to a long format by
cols = 2:5 #Columns 2 through 5 (which contain the various scores) are being melted into a single column
,names_to = 'Attribute'#The column that will store the names of the melted variables is named 'Attribute'
,values_to = 'Value'#The column that will store the values of the melted variables is named 'Value'
)
dane_pivot = dane_pivot %>%
pivot_wider( #reshapes the data back from the long format to a wide format by
id_cols = 1 #The 'Title' column is specified as the identifier column that remains unchanged
,names_from = 'Attribute'#The 'Attribute' column from the long format becomes the new column names in the wide format
,values_from = 'Value' #The 'Value' column from the long format becomes the values for the new columns in the wide format
) #Result: to convert the original wide format data frame into a long format, and then back into a wide format, while keeping the 'Title' column as the identifier. The result is a data frame where each score type (e.g., 'Hidden.Gem.Score,' 'IMDb.Score,' etc.) is in its own column.
oceny_metacritic = dane %>%
select(Title, Metacritic.Score) %>% #selects the 'Title' and 'Metacritic.Score' columns from the dane data fram
.[1:100,] %>% #filters the data to retain only the first 100 rows
drop_na() #removes rows with missing values in the 'Metacritic.Score' column
oceny_rotten_tomatoes = dane %>%
select(Title, Rotten.Tomatoes.Score) %>% #selects the 'Title' and 'Rotten.Tomatoes.Score' columns from dane data frame
.[1:100,] %>% #filters the data to retain only the first 100 rows
drop_na() #removes rows with missing values (NAs) in the 'Rotten.Tomatoes.Score' column
oceny_metacritic %>%
left_join(oceny_rotten_tomatoes, by = c('Title' = 'Title')) #to perform a left join between 'oceny_metacritic' and 'oceny_rotten_tomatoes' on the 'Title' column.
#The result is a merged data frame that combines information from both 'oceny_metacritic' and 'oceny_rotten_tomatoes,' where rows with matching 'Title' values are joined together.