# Load the necessary libraries
library(DBI)
library(RMySQL)

# Connect to the database
conn <- dbConnect(
  MySQL(), 
  user = 'jose.fuentes10',               
  password = 'bc905859f24769a7',            
  host = 'cunydata607sql.mysql.database.azure.com',  
  port = 3306,                              
  dbname = 'jose.fuentes10'                
)

# Select the database
dbExecute(conn, "USE `jose.fuentes10`;")
## [1] 0
# Describe table schema
schema <- dbGetQuery(conn, "DESCRIBE `movie_ratings`;")
print(schema)
##       Field            Type Null Key Default                    Extra
## 1 my_row_id bigint unsigned   NO PRI    <NA> auto_increment INVISIBLE
## 2     movie    varchar(255)  YES        <NA>                         
## 3    Carlos             int  YES        <NA>                         
## 4    Daniel             int  YES        <NA>                         
## 5      Alex             int  YES        <NA>                         
## 6    Ronald             int  YES        <NA>                         
## 7     Aidde             int  YES        <NA>
# Query the database and fetch data into a Dataframe
df <- dbGetQuery(conn, "SELECT * FROM `movie_ratings`;")

# Print the first few rows of the dataframe
head(df)
##                         movie Carlos Daniel Alex Ronald Aidde
## 1     Dead pool and Wolverine      5      4    3      5     4
## 2                Inside Out 2      4      5    5      4     5
## 3           Exorcist Believer      4      4    3      4     3
## 4 The Super Mario Bros. Movie      3      4    5      3     4
## 5                     Abigail      3      4    2      3     3
## 6           Mean Girls (2024)      2      3    4      4     3
# Summary of the data frame
summary(df)
##     movie               Carlos        Daniel       Alex           Ronald     
##  Length:6           Min.   :2.0   Min.   :3   Min.   :2.000   Min.   :3.000  
##  Class :character   1st Qu.:3.0   1st Qu.:4   1st Qu.:3.000   1st Qu.:3.250  
##  Mode  :character   Median :3.5   Median :4   Median :3.500   Median :4.000  
##                     Mean   :3.5   Mean   :4   Mean   :3.667   Mean   :3.833  
##                     3rd Qu.:4.0   3rd Qu.:4   3rd Qu.:4.750   3rd Qu.:4.000  
##                     Max.   :5.0   Max.   :5   Max.   :5.000   Max.   :5.000  
##      Aidde      
##  Min.   :3.000  
##  1st Qu.:3.000  
##  Median :3.500  
##  Mean   :3.667  
##  3rd Qu.:4.000  
##  Max.   :5.000
# View the first few rows of the data frame
head(df)
##                         movie Carlos Daniel Alex Ronald Aidde
## 1     Dead pool and Wolverine      5      4    3      5     4
## 2                Inside Out 2      4      5    5      4     5
## 3           Exorcist Believer      4      4    3      4     3
## 4 The Super Mario Bros. Movie      3      4    5      3     4
## 5                     Abigail      3      4    2      3     3
## 6           Mean Girls (2024)      2      3    4      4     3
# Missing Data Strategy
# Load data from SQL database
df <- dbGetQuery(conn, "SELECT * FROM movie_ratings;")

# Ensure all rating columns are numeric
df$Carlos <- as.numeric(df$Carlos)
df$Daniel <- as.numeric(df$Daniel)
df$Alex <- as.numeric(df$Alex)
df$Ronald <- as.numeric(df$Ronald)
df$Aidde <- as.numeric(df$Aidde)

# Check the structure of the data
str(df)
## 'data.frame':    6 obs. of  6 variables:
##  $ movie : chr  "Dead pool and Wolverine" "Inside Out 2" "Exorcist Believer" "The Super Mario Bros. Movie" ...
##  $ Carlos: num  5 4 4 3 3 2
##  $ Daniel: num  4 5 4 4 4 3
##  $ Alex  : num  3 5 3 5 2 4
##  $ Ronald: num  5 4 4 3 3 4
##  $ Aidde : num  4 5 3 4 3 3
# Calculate row means for the relevant columns (ignoring NAs)
row_means <- rowMeans(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], na.rm = TRUE)

# Impute missing values with row means
df$Carlos[is.na(df$Carlos)] <- row_means[is.na(df$Carlos)]
df$Daniel[is.na(df$Daniel)] <- row_means[is.na(df$Daniel)]
df$Alex[is.na(df$Alex)] <- row_means[is.na(df$Alex)]
df$Ronald[is.na(df$Ronald)] <- row_means[is.na(df$Ronald)]
df$Aidde[is.na(df$Aidde)] <- row_means[is.na(df$Aidde)]

# Replace NA values with the row median for each column
df$Carlos[is.na(df$Carlos)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Daniel[is.na(df$Daniel)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Alex[is.na(df$Alex)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Ronald[is.na(df$Ronald)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))
df$Aidde[is.na(df$Aidde)] <- apply(df[, c("Carlos", "Daniel", "Alex", "Ronald", "Aidde")], 1, function(x) median(x, na.rm = TRUE))

# Check for missing values
colSums(is.na(df))
##  movie Carlos Daniel   Alex Ronald  Aidde 
##      0      0      0      0      0      0
# Check the imputed data
head(df)
##                         movie Carlos Daniel Alex Ronald Aidde
## 1     Dead pool and Wolverine      5      4    3      5     4
## 2                Inside Out 2      4      5    5      4     5
## 3           Exorcist Believer      4      4    3      4     3
## 4 The Super Mario Bros. Movie      3      4    5      3     4
## 5                     Abigail      3      4    2      3     3
## 6           Mean Girls (2024)      2      3    4      4     3
#The Missing Data approach: Imputation was used to handle missing data, allowing the data set to remain complete for analysis. 
#Mean imputation was applied to fill missing values with the average, providing a balanced estimate. 
#Median imputation was  used to account for skewed data, ensuring outliers didn’t distort the results. 
#This combination preserved data integrity and statistical power while addressing missing values effectively.