1. Print the structure of your dataset
# Compact overview: one line per column with its type and first few values
str(netflix)
## tibble [8,807 × 12] (S3: tbl_df/tbl/data.frame)
## $ show_id : chr [1:8807] "s1" "s2" "s3" "s4" ...
## $ type : chr [1:8807] "Movie" "TV Show" "TV Show" "TV Show" ...
## $ title : chr [1:8807] "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
## $ director : chr [1:8807] "Kirsten Johnson" NA "Julien Leclercq" NA ...
## $ cast : chr [1:8807] NA "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ NA ...
## $ country : chr [1:8807] "United States" "South Africa" NA NA ...
## $ date_added : POSIXct[1:8807], format: "2021-09-25" "2021-09-24" ...
## $ release_year: num [1:8807] 2020 2021 2021 2021 2021 ...
## $ rating : chr [1:8807] "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
## $ duration : chr [1:8807] "90 min" "2 Seasons" "1 Season" "1 Season" ...
## $ listed_in : chr [1:8807] "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
## $ description : chr [1:8807] "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
2. List the variables in your dataset
# Column names of the dataset, in their stored order
colnames(netflix)
## [1] "show_id" "type" "title" "director" "cast"
## [6] "country" "date_added" "release_year" "rating" "duration"
## [11] "listed_in" "description"
3. Print the top 15 rows of your dataset
# First 15 rows of the dataset
head(netflix, n = 15)
4. User-defined function using a variable
# Count how many titles carry a given content rating.
#
# Arguments:
#   rating_value  A single rating label to match exactly, e.g. "TV-MA".
#   data          A data frame with a `rating` column. Defaults to the global
#                 `netflix` dataset, so one-argument calls keep working.
#
# Returns the number of rows whose `rating` equals `rating_value`. Rows with
# a missing rating are not counted.
count_by_rating <- function(rating_value, data = netflix) {
  # A vector of labels would be recycled against the column and produce a
  # misleading count, so insist on exactly one value.
  stopifnot(length(rating_value) == 1L)
  sum(data$rating == rating_value, na.rm = TRUE)
}
# Example usage: number of titles rated "TV-MA" in the full dataset
count_by_rating("TV-MA")
## [1] 3207
5. Filter rows based on logical criteria
# Keep only the titles whose country is exactly "Canada" and whose rating is
# "TV-14", then preview the first rows
filtered_shows <- netflix %>%
  filter(country == "Canada" & rating == "TV-14")
head(filtered_shows)
6. Dependent & Independent Variables and Reshaping
# Treat 'rating' as the dependent variable and 'country' as the independent
# one: keep just those two columns and discard rows where either is missing
reshaped_data <- netflix %>%
  select(country, rating) %>%
  drop_na()
head(reshaped_data)
7. Remove missing values
# Drop every row that has a missing value in any column
netflix_clean <- na.omit(netflix)
8. Identify and remove duplicated data
# Keep the first occurrence of each fully identical row and drop the repeats
netflix_no_duplicates <- netflix_clean[!duplicated(netflix_clean), ]
9. Reorder multiple rows in descending order
# Descending order by 'duration'.
# 'duration' is text ("90 min", "2 Seasons"), so sorting it directly is
# alphabetical and would rank "99 min" above "100 min". Sort on the leading
# number instead, and keep movies and TV shows apart because minutes and
# seasons are not comparable.
netflix_sorted <- arrange(
  netflix_no_duplicates,
  type,
  desc(as.numeric(sub(" .*$", "", duration)))
)
head(netflix_sorted)
10. Rename some of the column names
# Give the columns more descriptive names (new_name = old_name)
netflix_renamed <- netflix_no_duplicates %>%
  rename(
    ContentType = type,
    CountryName = country,
    ContentRating = rating,
    WatchDuration = duration,
    Genre = listed_in
  )
11. Add new variables using a mathematical function
# Add new variables: the number of characters in the duration text, and
# twice that number
netflix_renamed <- netflix_renamed %>%
  mutate(
    Duration_Length = nchar(WatchDuration),
    Double_Duration_Length = Duration_Length * 2
  )
netflix_renamed %>%
  select(WatchDuration, Duration_Length, Double_Duration_Length) %>%
  head()
12. Create a training set using a random number generator
# Fix the seed so the same random rows are drawn on every run
set.seed(42)
# Draw 100 rows without replacement. slice_sample() replaces the superseded
# sample_n() and returns all available rows instead of failing when the data
# has fewer than 100.
train_set <- slice_sample(netflix_renamed, n = 100)
head(train_set)
13. Summary statistics
# Per-column summary: quantiles and mean for numeric and date columns,
# length/class/mode for character columns
summary(netflix_renamed)
## show_id ContentType title director
## Length:5328 Length:5328 Length:5328 Length:5328
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## cast CountryName date_added
## Length:5328 Length:5328 Min. :2008-01-01 00:00:00
## Class :character Class :character 1st Qu.:2018-04-06 18:00:00
## Mode :character Mode :character Median :2019-06-18 00:00:00
## Mean :2019-04-29 02:27:01
## 3rd Qu.:2020-06-27 06:00:00
## Max. :2021-09-24 00:00:00
## release_year ContentRating WatchDuration Genre
## Min. :1942 Length:5328 Length:5328 Length:5328
## 1st Qu.:2011 Class :character Class :character Class :character
## Median :2016 Mode :character Mode :character Mode :character
## Mean :2013
## 3rd Qu.:2018
## Max. :2021
## description Duration_Length Double_Duration_Length
## Length:5328 Min. : 5.000 Min. :10.00
## Class :character 1st Qu.: 6.000 1st Qu.:12.00
## Mode :character Median : 7.000 Median :14.00
## Mean : 6.564 Mean :13.13
## 3rd Qu.: 7.000 3rd Qu.:14.00
## Max. :10.000 Max. :20.00
15. Scatter plot for 2 variables
# Keep movies only and turn their duration text into a number of minutes
# (e.g., "90 min" -> 90)
netflix_movies <- netflix %>%
  filter(type == "Movie") %>%
  mutate(duration_num = as.numeric(gsub(" min", "", duration)))
# Scatter plot of movie length against release year
ggplot(netflix_movies, aes(release_year, duration_num)) +
  geom_point(color = "darkred", alpha = 0.5) +
  labs(
    title = "Scatter Plot: Movie Duration vs Release Year",
    x = "Release Year",
    y = "Duration (minutes)"
  ) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

16. Bar plot for 2 variables
# Number of titles of each type, stored in a column named 'count'
type_count <- netflix %>%
  count(type, name = "count")
# Bar plot with one bar per type; bar height is the precomputed count
ggplot(type_count, aes(type, count, fill = type)) +
  geom_col() +
  labs(
    title = "Bar Plot: Number of Movies vs TV Shows",
    x = "Type",
    y = "Count"
  ) +
  theme_minimal()

17. Pearson correlation between two variables
# Pearson correlation between the duration text length and its double.
# The second variable is exactly twice the first, so the result is 1 by
# construction.
with(
  netflix_renamed,
  cor(Duration_Length, Double_Duration_Length, method = "pearson")
)
## [1] 1