2. List the variables in your dataset

# Print the column (variable) names of the data set
colnames(netflix)
##  [1] "show_id"      "type"         "title"        "director"     "cast"        
##  [6] "country"      "date_added"   "release_year" "rating"       "duration"    
## [11] "listed_in"    "description"

4. User-defined function using a variable

#' Count how many shows exist for a given rating
#'
#' @param rating_value Rating label to look for, e.g. "TV-MA".
#' @param data Data frame with a `rating` column. Defaults to the `netflix`
#'   data set, so existing one-argument calls keep working.
#' @return Number of rows whose `rating` equals `rating_value`; rows with a
#'   missing rating are not counted.
count_by_rating <- function(rating_value, data = netflix) {
  # `[[` needs the exact column name; `$` would partial-match on a plain
  # data frame. na.rm = TRUE skips the NA produced by comparing a missing
  # rating.
  sum(data[["rating"]] == rating_value, na.rm = TRUE)
}

# Example usage: number of rows in netflix rated "TV-MA"
count_by_rating("TV-MA")
## [1] 3207

5. Filter rows based on logical criteria

# Keep only rows whose country is exactly "Canada" and whose rating is "TV-14"
filtered_shows <- netflix %>%
  filter(country == "Canada" & rating == "TV-14")
head(filtered_shows)

6. Dependent & Independent Variables and Reshaping

# Treat 'rating' as the dependent variable and 'country' as the independent
# one: keep just those two columns and discard rows where either is missing
reshaped_data <- netflix %>%
  select(country, rating) %>%
  drop_na()
head(reshaped_data)

7. Remove missing values

# Drop every row that has a missing value in any column
netflix_clean <- na.omit(netflix)

8. Identify and remove duplicated data

# duplicated() flags rows that repeat an earlier row; keep only the unflagged
# ones, so the first occurrence of each row is retained
netflix_no_duplicates <- netflix_clean[!duplicated(netflix_clean), ]

9. Reorder multiple rows in descending order

# Descending order by the numeric part of 'duration'.
# 'duration' is a character column, so desc(duration) would compare the
# strings lexicographically (e.g. "99 min" ahead of "100 min"). Sort on the
# leading number instead.
# NOTE(review): values that are not in minutes (presumably TV shows counted in
# seasons) are ranked by their leading number as well - confirm whether they
# should be sorted separately.
netflix_sorted <- arrange(
  netflix_no_duplicates,
  desc(as.numeric(sub(" .*$", "", duration)))
)
head(netflix_sorted)

10. Rename some of the column names

# Give selected columns more descriptive names (new_name = old_name)
netflix_renamed <- netflix_no_duplicates %>%
  rename(
    ContentType = type,
    CountryName = country,
    ContentRating = rating,
    WatchDuration = duration,
    Genre = listed_in
  )

11. Add new variables using a mathematical function

# Add new variables: the number of characters in the duration string (nchar
# counts characters, not words) and twice that value
netflix_renamed <- netflix_renamed %>%
  mutate(
    Duration_Length = nchar(WatchDuration),
    Double_Duration_Length = Duration_Length * 2
  )

netflix_renamed %>%
  select(WatchDuration, Duration_Length, Double_Duration_Length) %>%
  head()

12. Create a training set using random number generator

# Fix the RNG seed so the same 100 rows are drawn on every run
set.seed(42)
train_set <- netflix_renamed %>%
  sample_n(100)
train_set %>%
  head()

13. Summary statistics

# Per-column summary of the renamed data, including the two derived
# Duration_Length columns
summary(netflix_renamed)
##    show_id          ContentType           title             director        
##  Length:5328        Length:5328        Length:5328        Length:5328       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      cast           CountryName          date_added                 
##  Length:5328        Length:5328        Min.   :2008-01-01 00:00:00  
##  Class :character   Class :character   1st Qu.:2018-04-06 18:00:00  
##  Mode  :character   Mode  :character   Median :2019-06-18 00:00:00  
##                                        Mean   :2019-04-29 02:27:01  
##                                        3rd Qu.:2020-06-27 06:00:00  
##                                        Max.   :2021-09-24 00:00:00  
##   release_year  ContentRating      WatchDuration         Genre          
##  Min.   :1942   Length:5328        Length:5328        Length:5328       
##  1st Qu.:2011   Class :character   Class :character   Class :character  
##  Median :2016   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2013                                                           
##  3rd Qu.:2018                                                           
##  Max.   :2021                                                           
##  description        Duration_Length  Double_Duration_Length
##  Length:5328        Min.   : 5.000   Min.   :10.00         
##  Class :character   1st Qu.: 6.000   1st Qu.:12.00         
##  Mode  :character   Median : 7.000   Median :14.00         
##                     Mean   : 6.564   Mean   :13.13         
##                     3rd Qu.: 7.000   3rd Qu.:14.00         
##                     Max.   :10.000   Max.   :20.00

14. Statistical functions: Mean, Median, Mode, Range

# Use Duration_Length (numeric) for stats.
# NOTE(review): Duration_Length is the character count of the duration string
# (computed with nchar), not a length of time - confirm this is the intended
# variable for these statistics.
mean(netflix_renamed$Duration_Length)
## [1] 6.564189
median(netflix_renamed$Duration_Length)
## [1] 7
# Statistical mode: the most frequent value of a vector.
# When several values share the top count, the one that appears first wins.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Map every element to the position of its value among the distinct values,
  # then tally how often each position occurs.
  positions <- match(v, distinct_vals)
  tallies <- tabulate(positions)
  distinct_vals[which.max(tallies)]
}
# Most frequent Duration_Length value
getmode(netflix_renamed$Duration_Length)
## [1] 7
# Smallest and largest Duration_Length
range(netflix_renamed$Duration_Length)
## [1]  5 10

15. Scatter plot for 2 variables

# Movies only, with the number pulled out of the "duration" text
# (e.g., "90 min" → 90) into a numeric column
netflix_movies <- netflix %>%
  filter(type == "Movie") %>%
  mutate(duration_num = as.numeric(gsub(" min", "", duration)))

# Scatter plot of movie duration against release year
ggplot(data = netflix_movies, mapping = aes(x = release_year, y = duration_num)) +
  geom_point(color = "darkred", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Scatter Plot: Movie Duration vs Release Year",
    x = "Release Year",
    y = "Duration (minutes)"
  )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

16. Bar plot for 2 variables

# Number of entries per content type, stored in a column named "count"
type_count <- count(netflix, type, name = "count")

# Bar plot of those counts, one bar per type
ggplot(data = type_count, mapping = aes(x = type, y = count, fill = type)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Bar Plot: Number of Movies vs TV Shows",
    x = "Type",
    y = "Count"
  )

17. Pearson correlation between two variables

# Pearson correlation between the duration character length and its double.
# Double_Duration_Length is Duration_Length * 2, so the coefficient is 1.
with(
  netflix_renamed,
  cor(Duration_Length, Double_Duration_Length, method = "pearson")
)
## [1] 1