Grp3 - Assignment 1 - Netflix Dataset

1. Print the structure of your dataset

# Compact overview: dimensions plus the type and first values of each column
utils::str(netflix)

## tibble [8,807 × 12] (S3: tbl_df/tbl/data.frame)
##  $ show_id     : chr [1:8807] "s1" "s2" "s3" "s4" ...
##  $ type        : chr [1:8807] "Movie" "TV Show" "TV Show" "TV Show" ...
##  $ title       : chr [1:8807] "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
##  $ director    : chr [1:8807] "Kirsten Johnson" NA "Julien Leclercq" NA ...
##  $ cast        : chr [1:8807] NA "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ NA ...
##  $ country     : chr [1:8807] "United States" "South Africa" NA NA ...
##  $ date_added  : POSIXct[1:8807], format: "2021-09-25" "2021-09-24" ...
##  $ release_year: num [1:8807] 2020 2021 2021 2021 2021 ...
##  $ rating      : chr [1:8807] "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
##  $ duration    : chr [1:8807] "90 min" "2 Seasons" "1 Season" "1 Season" ...
##  $ listed_in   : chr [1:8807] "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
##  $ description : chr [1:8807] "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...

2. List the variables in your dataset

# Column (variable) names of the dataset
colnames(netflix)

##  [1] "show_id"      "type"         "title"        "director"     "cast"        
##  [6] "country"      "date_added"   "release_year" "rating"       "duration"    
## [11] "listed_in"    "description"

3. Print the top 15 rows of your dataset

# First 15 rows of the dataset
head(netflix, n = 15)

4. User-defined function using a variable

# Count how many titles carry a given content rating.
#
# Args:
#   rating_value: Rating label to match exactly, e.g. "TV-MA".
#   data: Data frame with a `rating` column. Defaults to the global `netflix`
#     dataset, so existing one-argument calls keep working.
#
# Returns:
#   The number of rows whose `rating` equals `rating_value`. Rows with a
#   missing rating are not counted.
count_by_rating <- function(rating_value, data = netflix) {
  # Fail loudly instead of silently returning 0 when the column is absent
  stopifnot("rating" %in% names(data))
  # `==` yields NA for missing ratings; na.rm = TRUE leaves them out of the sum
  sum(data[["rating"]] == rating_value, na.rm = TRUE)
}

# Example: number of titles rated TV-MA
count_by_rating(rating_value = "TV-MA")

## [1] 3207

5. Filter rows based on logical criteria

# Keep titles whose country is exactly "Canada" and whose rating is TV-14
filtered_shows <- netflix %>%
  filter(country == "Canada" & rating == "TV-14")
head(filtered_shows)

6. Dependent & Independent Variables and Reshaping

# Treat 'rating' as the dependent variable and 'country' as the independent one:
# keep just those two columns and discard rows where either is missing
reshaped_data <- netflix %>%
  select(country, rating) %>%
  drop_na()
head(reshaped_data)

7. Remove missing values

# Drop every row that has a missing value in any column
netflix_clean <- na.omit(netflix)

8. Identify and remove duplicated data

# duplicated() flags rows that repeat an earlier row across all columns;
# negating it keeps the first occurrence of each row
netflix_no_duplicates <- netflix_clean[!duplicated(netflix_clean), ]

9. Reorder multiple rows in descending order

# Descending order by the numeric part of 'duration'.
# `duration` is text ("90 min", "2 Seasons"), so desc(duration) would sort
# alphabetically and rank "99 min" above "100 min". Strip the non-digits and
# sort on the resulting number instead. Minutes and seasons share one scale
# here; add `type` as a leading sort key to rank movies and TV shows separately.
netflix_sorted <- arrange(
  netflix_no_duplicates,
  desc(as.numeric(gsub("[^0-9]", "", duration)))
)
head(netflix_sorted)

10. Rename some of the column names

# Give five columns more descriptive names (new_name = old_name)
netflix_renamed <- netflix_no_duplicates %>%
  rename(
    ContentType = type,
    CountryName = country,
    ContentRating = rating,
    WatchDuration = duration,
    Genre = listed_in
  )

11. Add new variables using a mathematical function

# Add new variable: character count of 'WatchDuration'
# (nchar() counts characters, not words, e.g. "90 min" -> 6)
netflix_renamed$Duration_Length <- nchar(netflix_renamed$WatchDuration)
# Second derived variable: twice the character count
netflix_renamed$Double_Duration_Length <- netflix_renamed$Duration_Length * 2

head(select(netflix_renamed, WatchDuration, Duration_Length, Double_Duration_Length))

12. Create a training set using random number generator

# Fix the RNG seed so the same rows are drawn on every run
set.seed(42)
# Draw 100 rows at random, without replacement.
# slice_sample() is the current replacement for the superseded sample_n().
train_set <- slice_sample(netflix_renamed, n = 100)
head(train_set)

13. Summary statistics

# Per-column summary: quantiles for numeric/date columns, length/class for text
netflix_renamed %>%
  summary()

##    show_id          ContentType           title             director        
##  Length:5328        Length:5328        Length:5328        Length:5328       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      cast           CountryName          date_added                 
##  Length:5328        Length:5328        Min.   :2008-01-01 00:00:00  
##  Class :character   Class :character   1st Qu.:2018-04-06 18:00:00  
##  Mode  :character   Mode  :character   Median :2019-06-18 00:00:00  
##                                        Mean   :2019-04-29 02:27:01  
##                                        3rd Qu.:2020-06-27 06:00:00  
##                                        Max.   :2021-09-24 00:00:00  
##   release_year  ContentRating      WatchDuration         Genre          
##  Min.   :1942   Length:5328        Length:5328        Length:5328       
##  1st Qu.:2011   Class :character   Class :character   Class :character  
##  Median :2016   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2013                                                           
##  3rd Qu.:2018                                                           
##  Max.   :2021                                                           
##  description        Duration_Length  Double_Duration_Length
##  Length:5328        Min.   : 5.000   Min.   :10.00         
##  Class :character   1st Qu.: 6.000   1st Qu.:12.00         
##  Mode  :character   Median : 7.000   Median :14.00         
##                     Mean   : 6.564   Mean   :13.13         
##                     3rd Qu.: 7.000   3rd Qu.:14.00         
##                     Max.   :10.000   Max.   :20.00

14. Statistical functions: Mean, Median, Mode, Range

# Duration_Length is the only derived numeric column, so the statistics use it
mean(netflix_renamed[["Duration_Length"]])

## [1] 6.564189

# Middle value of Duration_Length
median(netflix_renamed[["Duration_Length"]])

## [1] 7

# Statistical mode: the most frequent value of `v`.
# Base R's mode() reports the storage mode, so a helper is needed.
# When several values tie, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element within the distinct values, then a tally per position
  positions <- match(v, distinct_values)
  occurrences <- tabulate(positions)
  distinct_values[which.max(occurrences)]
}
# Most frequent Duration_Length value
getmode(netflix_renamed[["Duration_Length"]])

## [1] 7

# Smallest and largest Duration_Length
range(netflix_renamed[["Duration_Length"]])

## [1]  5 10

15. Scatter plot for 2 variables

# Keep only movies and turn the text duration into a number of minutes
# (e.g., "90 min" → 90). Rows without a usable duration become NA and are
# left out of the plot by geom_point().
netflix_movies <- netflix %>%
  filter(type == "Movie") %>%
  mutate(duration_num = as.numeric(gsub(" min", "", duration)))

# Scatter plot: one semi-transparent point per movie
ggplot(netflix_movies) +
  geom_point(
    aes(x = release_year, y = duration_num),
    alpha = 0.5,
    color = "darkred"
  ) +
  labs(
    title = "Scatter Plot: Movie Duration vs Release Year",
    x = "Release Year",
    y = "Duration (minutes)"
  ) +
  theme_minimal()

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

16. Bar plot for 2 variables

# Tally the titles of each type into a `count` column
type_count <- netflix %>%
  count(type, name = "count")

# Bar plot: bar heights are taken directly from the precomputed counts
ggplot(type_count, aes(x = type, y = count, fill = type)) +
  geom_col() +
  labs(
    title = "Bar Plot: Number of Movies vs TV Shows",
    x = "Type",
    y = "Count"
  ) +
  theme_minimal()

17. Pearson correlation between two variables

# Correlation between duration character length and its double.
# NOTE: Double_Duration_Length is defined as Duration_Length * 2, so the two
# are perfectly linearly related and the coefficient is 1 by construction.
# For an informative result, correlate two independently measured variables.
cor(netflix_renamed$Duration_Length, netflix_renamed$Double_Duration_Length, method = "pearson")

## [1] 1

Grp3 - Assignment 1 - Netflix Dataset

Group 3

2025-08-03

1. Print the structure of your dataset

2. List the variables in your dataset

3. Print the top 15 rows of your dataset

4. User-defined function using a variable

5. Filter rows based on logical criteria

6. Dependent & Independent Variables and Reshaping

7. Remove missing values

8. Identify and remove duplicated data

9. Reorder multiple rows in descending order

10. Rename some of the column names

11. Add new variables using a mathematical function

12. Create a training set using random number generator

13. Summary statistics

14. Statistical functions: Mean, Median, Mode, Range

15. Scatter plot for 2 variables

16. Bar plot for 2 variables

17. Pearson correlation between two variables

Footer