# Load the necessary libraries
library(DBI)
library(RMySQL)
# Connect to the database
conn <- dbConnect(
MySQL(),
user = 'jose.fuentes10',
password = 'bc905859f24769a7',
host = 'cunydata607sql.mysql.database.azure.com',
port = 3306,
dbname = 'jose.fuentes10'
)
# Select the database
dbExecute(conn, "USE `jose.fuentes10`;")
## [1] 0
# Describe table schema
schema <- dbGetQuery(conn, "DESCRIBE `movie_ratings`;")
print(schema)
## Field Type Null Key Default Extra
## 1 my_row_id bigint unsigned NO PRI <NA> auto_increment INVISIBLE
## 2 movie varchar(255) YES <NA>
## 3 Carlos int YES <NA>
## 4 Daniel int YES <NA>
## 5 Alex int YES <NA>
## 6 Ronald int YES <NA>
## 7 Aidde int YES <NA>
# Query the database and fetch data into a Dataframe
df <- dbGetQuery(conn, "SELECT * FROM `movie_ratings`;")
# Print the first few rows of the dataframe
head(df)
## movie Carlos Daniel Alex Ronald Aidde
## 1 Dead pool and Wolverine 5 4 3 5 4
## 2 Inside Out 2 4 5 5 4 5
## 3 Exorcist Believer 4 4 3 4 3
## 4 The Super Mario Bros. Movie 3 4 5 3 4
## 5 Abigail 3 4 2 3 3
## 6 Mean Girls (2024) 2 3 4 4 3
# Summary of the data frame
summary(df)
## movie Carlos Daniel Alex Ronald
## Length:6 Min. :2.0 Min. :3 Min. :2.000 Min. :3.000
## Class :character 1st Qu.:3.0 1st Qu.:4 1st Qu.:3.000 1st Qu.:3.250
## Mode :character Median :3.5 Median :4 Median :3.500 Median :4.000
## Mean :3.5 Mean :4 Mean :3.667 Mean :3.833
## 3rd Qu.:4.0 3rd Qu.:4 3rd Qu.:4.750 3rd Qu.:4.000
## Max. :5.0 Max. :5 Max. :5.000 Max. :5.000
## Aidde
## Min. :3.000
## 1st Qu.:3.000
## Median :3.500
## Mean :3.667
## 3rd Qu.:4.000
## Max. :5.000
# View the first few rows of the data frame
head(df)
## movie Carlos Daniel Alex Ronald Aidde
## 1 Dead pool and Wolverine 5 4 3 5 4
## 2 Inside Out 2 4 5 5 4 5
## 3 Exorcist Believer 4 4 3 4 3
## 4 The Super Mario Bros. Movie 3 4 5 3 4
## 5 Abigail 3 4 2 3 3
## 6 Mean Girls (2024) 2 3 4 4 3
# Missing Data Strategy
# Load data from SQL database
df <- dbGetQuery(conn, "SELECT * FROM movie_ratings;")
# Ensure all rating columns are numeric
df$Carlos <- as.numeric(df$Carlos)
df$Daniel <- as.numeric(df$Daniel)
df$Alex <- as.numeric(df$Alex)
df$Ronald <- as.numeric(df$Ronald)
df$Aidde <- as.numeric(df$Aidde)
# Check the structure of the data
str(df)
## 'data.frame': 6 obs. of 6 variables:
## $ movie : chr "Dead pool and Wolverine" "Inside Out 2" "Exorcist Believer" "The Super Mario Bros. Movie" ...
## $ Carlos: num 5 4 4 3 3 2
## $ Daniel: num 4 5 4 4 4 3
## $ Alex : num 3 5 3 5 2 4
## $ Ronald: num 5 4 4 3 3 4
## $ Aidde : num 4 5 3 4 3 3
# Calculate row means for the relevant columns (ignoring NAs)
row_means <- rowMeans(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], na.rm = TRUE)
# Impute missing values with row means
df$Carlos[is.na(df$Carlos)] <- row_means[is.na(df$Carlos)]
df$Daniel[is.na(df$Daniel)] <- row_means[is.na(df$Daniel)]
df$Alex[is.na(df$Alex)] <- row_means[is.na(df$Alex)]
df$Ronald[is.na(df$Ronald)] <- row_means[is.na(df$Ronald)]
df$Aidde[is.na(df$Aidde)] <- row_means[is.na(df$Aidde)]
# Replace NA values with the row median for each column
df$Carlos[is.na(df$Carlos)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Daniel[is.na(df$Daniel)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Alex[is.na(df$Alex)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Ronald[is.na(df$Ronald)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Aidde[is.na(df$Aidde)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
# Check for missing values
colSums(is.na(df))
## movie Carlos Daniel Alex Ronald Aidde
## 0 0 0 0 0 0
# Check the imputed data
head(df)
## movie Carlos Daniel Alex Ronald Aidde
## 1 Dead pool and Wolverine 5 4 3 5 4
## 2 Inside Out 2 4 5 5 4 5
## 3 Exorcist Believer 4 4 3 4 3
## 4 The Super Mario Bros. Movie 3 4 5 3 4
## 5 Abigail 3 4 2 3 3
## 6 Mean Girls (2024) 2 3 4 4 3
#The Missing Data approach: Imputation was used to handle missing data, allowing the data set to remain complete for analysis.
#Mean imputation was applied to fill missing values with the average, providing a balanced estimate.
#Median imputation was used to account for skewed data, ensuring outliers didn’t distort the results.
#This combination preserved data integrity and statistical power while addressing missing values effectively.