# Step 1: Randomly Split Data into Training and Test Sets
# First, let’s load the data and split it into training and test sets:
# Load required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the dataset
hulu_data <- read.csv("hulu_dataset.csv")
# Set seed for reproducibility
set.seed(123)
# Draw 80% of the row indices (rounded down) without replacement.
# sample.int(n, k) yields the same indices as sample(1:n, k) for a given seed,
# but stays well-defined when the data frame has zero rows.
n_rows <- nrow(hulu_data)
split_index <- sample.int(n_rows, floor(0.8 * n_rows))
# Create training and test sets
train_data <- hulu_data[split_index, ]
# Select the complement explicitly instead of using hulu_data[-split_index, ]:
# negative indexing with an empty index returns zero rows rather than all rows,
# which would silently empty the test set whenever no training rows are drawn.
test_data <- hulu_data[setdiff(seq_len(n_rows), split_index), ]
# Step 2: Evaluate Transformations for Quantitative Variables
# Let’s examine the distribution of quantitative variables in the training set:
# Identify quantitative variables
quant_vars <- c("releaseYear", "imdbAverageRating", "imdbNumVotes")
# Create a 30-bin histogram of the training data for each quantitative variable
for (var in quant_vars) {
  # .data[[var]] looks the column up by its string name; it replaces the
  # deprecated aes_string(). The x label is set explicitly so the axis still
  # shows the column name.
  p <- ggplot(train_data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30) +
    labs(x = var) +
    ggtitle(paste("Distribution of", var))
  # Plots built inside a loop are not auto-printed, so print explicitly
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 971 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 971 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Step 3: Apply Transformations to the Training Data
# Store the natural log of the vote count as a new training-set column.
# NOTE(review): log() returns -Inf for a count of 0 and NA for a missing
# count; confirm how such rows should be treated by the later scaling steps.
train_data[["log_imdbNumVotes"]] <- log(train_data[["imdbNumVotes"]])
# Step 4: Apply Transformations to the Test Data
# The test set receives the identical transformation.
test_data[["log_imdbNumVotes"]] <- log(test_data[["imdbNumVotes"]])
# Step 5: Min-Max Scaling
# Perform min-max scaling on the quantitative variables in the training data:
# Rescale a vector linearly so its smallest value maps to 0 and its largest
# to 1.
#
# Args:
#   x:     Vector of values to rescale.
#   na.rm: If TRUE (default), missing values are ignored when the minimum and
#          maximum are computed and stay NA in the result. If FALSE, any NA in
#          `x` makes every element of the result NA.
#
# Returns:
#   A vector the same length as `x`. When all values of `x` are equal the
#   range is zero and the non-missing results are NaN.
min_max_scale <- function(x, na.rm = TRUE) {
  # With nothing left after dropping NAs, range() would warn and return
  # Inf/-Inf; return an all-NA result of matching length instead.
  if (na.rm && all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }
  bounds <- range(x, na.rm = na.rm)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
# Columns to rescale; each result is stored in train_data as "scaled_<column>"
scaled_vars <- c(
  "releaseYear",
  "imdbAverageRating",
  "log_imdbNumVotes"
)
for (var in scaled_vars) {
  train_data[[paste0("scaled_", var)]] <- min_max_scale(train_data[[var]])
}
# Step 6: Apply Scaling to the Test Set
# Apply the same scaling to the test data using the min and max values from the training data:
# Scale each test-set column with the minimum and maximum of the matching
# training-set column, so both sets share one scale. Test values outside the
# training range fall outside [0, 1].
for (var in scaled_vars) {
  # na.rm = TRUE: without it a single NA in the training column makes min()
  # and max() return NA, which turns the entire scaled test column into NA.
  min_val <- min(train_data[[var]], na.rm = TRUE)
  max_val <- max(train_data[[var]], na.rm = TRUE)
  test_data[paste0("scaled_", var)] <- (test_data[[var]] - min_val) / (max_val - min_val)
}