Week6

# Step 1: Randomly Split Data into Training and Test Sets
# First, let’s load the data and split it into training and test sets:

# Load required libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Load the dataset
hulu_data <- read.csv("hulu_dataset.csv")

# Set seed for reproducibility
set.seed(123)

# Draw 80% of the row indices (rounded down) for the training set.
# sample.int(n, size) draws `size` distinct indices from 1..n directly, and
# floor() makes the rounding of the non-integer 0.8 * n explicit instead of
# relying on implicit truncation of the size argument.
n_rows <- nrow(hulu_data)
train_size <- floor(0.8 * n_rows)
split_index <- sample.int(n_rows, train_size)

# Create training and test sets.
# The test set is selected with a logical mask rather than `-split_index`:
# negative indexing with an empty index vector selects zero rows instead of
# every row, which would silently produce an empty test set.
train_data <- hulu_data[split_index, ]
test_data <- hulu_data[!(seq_len(n_rows) %in% split_index), ]

# Step 2: Evaluate Transformations for Quantitative Variables
# Let’s examine the distribution of quantitative variables in the training set:

# Identify quantitative variables
quant_vars <- c("releaseYear", "imdbAverageRating", "imdbNumVotes")

# Create a 30-bin histogram of each quantitative variable in the training set.
# Columns are looked up by name through the `.data` pronoun because
# `aes_string()` is deprecated; the x-axis label is set explicitly so that it
# shows the column name rather than the indexing expression.
for (var in quant_vars) {
  p <- ggplot(train_data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30) +
    xlab(var) +
    ggtitle(paste("Distribution of", var))
  # Inside a for loop plots are not auto-printed, so print explicitly.
  print(p)
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 971 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 971 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Step 3: Apply Transformations to the Training Data
# Let’s apply the log transformation to imdbNumVotes:

# Store the natural log of the vote counts as a new training-set column.
train_data[["log_imdbNumVotes"]] <- log(train_data[["imdbNumVotes"]])

# Step 4: Apply Transformations to the Test Data
# Apply the same transformation to the test data:

# Store the natural log of the vote counts as a new test-set column.
test_data[["log_imdbNumVotes"]] <- log(test_data[["imdbNumVotes"]])

# Step 5: Min-Max Scaling
# Perform min-max scaling on the quantitative variables in the training data:

#' Rescale a numeric vector to the [0, 1] interval (min-max scaling).
#'
#' The minimum and maximum are computed over the non-missing values only, so
#' a vector containing `NA` is still rescaled; the `NA` entries themselves
#' stay `NA` in the result. Without this, one missing value would make both
#' bounds `NA` and turn the whole result into `NA`.
#'
#' @param x A numeric vector.
#' @return A numeric vector the same length as `x`, where the smallest
#'   non-missing value maps to 0 and the largest to 1. A vector whose
#'   non-missing values are all equal yields `NaN` (0 / 0) for those entries.
min_max_scale <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}

# Columns to rescale; the vote count enters on the log scale.
scaled_vars <- c("releaseYear", "imdbAverageRating", "log_imdbNumVotes")

# Rescale every listed training column with min_max_scale() and store the
# result in a new column named "scaled_<original name>".
scaled_names <- paste0("scaled_", scaled_vars)
train_data[scaled_names] <- lapply(train_data[scaled_vars], min_max_scale)

# Step 6: Apply Scaling to the Test Set
# Apply the same scaling to the test data using the min and max values from the training data:

# Rescale the test-set columns with the minimum and maximum taken from the
# training data, so that both sets share one scale. Test values outside the
# training range therefore fall outside [0, 1].
for (var in scaled_vars) {
  # na.rm = TRUE: a single missing training value would otherwise make both
  # bounds NA and turn the entire scaled test column into NA.
  train_range <- range(train_data[[var]], na.rm = TRUE)
  min_val <- train_range[1]
  max_val <- train_range[2]
  test_data[paste0("scaled_", var)] <- (test_data[[var]] - min_val) / (max_val - min_val)
}

Week6

Tedros Habtemariam

2024-12-16