# Load the IMDB data set. The path must be wrapped in plain ASCII double
# quotes; typographic quotes are not valid R string delimiters.
data <- read.csv("C:/Users/lohit allaparti/Downloads/imdb.csv")
library(dplyr)
library(ggplot2)
# Per-group summaries. Besides the mean, each table records the group size
# `n`, which is required to turn the groups into relative frequencies.
grouped_genre <- data %>%
  group_by(genre) %>%
  summarize(mean_score = mean(score, na.rm = TRUE), n = n())
print(grouped_genre)

grouped_status <- data %>%
  group_by(status) %>%
  summarize(mean_revenue = mean(revenue, na.rm = TRUE), n = n())
print(grouped_status)

grouped_orig_lang <- data %>%
  group_by(orig_lang) %>%
  summarize(mean_budget = mean(budget_x, na.rm = TRUE), n = n())
print(grouped_orig_lang)

# After running this code we will have three separate summary tables
# (grouped_genre, grouped_status, and grouped_orig_lang).

#' Tag the rarest group(s) of a summary table as anomalies
#'
#' Converts the per-group row counts into relative frequencies and labels the
#' group(s) with the smallest frequency "Anomaly"; every other group is
#' labelled "Normal".
#'
#' @param grouped_data A summary table with one row per group that contains
#'   a column of group sizes (see `count_col`).
#' @param column_name Name of the grouping column. It is not used in the
#'   computation and is kept so that existing calls keep working.
#' @param count_col Name of the column holding the number of rows per group.
#' @return `grouped_data` with two added columns: `probability` (group size
#'   divided by the total of all group sizes) and `anomaly_tag`.
calculate_probabilities <- function(grouped_data, column_name,
                                    count_col = "n") {
  if (!count_col %in% names(grouped_data)) {
    stop(
      "`grouped_data` has no `", count_col, "` column of group sizes.",
      call. = FALSE
    )
  }

  # The frequencies must come from the stored group sizes: calling
  # n() / sum(n()) inside mutate() on the summary table divides a single
  # number by itself, which is 1 for every row.
  group_sizes <- grouped_data[[count_col]]
  grouped_data$probability <- group_sizes / sum(group_sizes)

  # The minimum is one of the compared values itself, so `==` is exact here.
  min_probability <- min(grouped_data$probability)
  grouped_data$anomaly_tag <- ifelse(
    grouped_data$probability == min_probability,
    "Anomaly",
    "Normal"
  )

  grouped_data
}

grouped_genre <- calculate_probabilities(grouped_genre, "genre")
grouped_status <- calculate_probabilities(grouped_status, "status")
grouped_orig_lang <- calculate_probabilities(grouped_orig_lang, "orig_lang")

print(grouped_genre)
print(grouped_status)
print(grouped_orig_lang)
# The code above calculates probabilities and assigns an "anomaly" tag to the
# groups within each of the three categorical variables: 'genre', 'status',
# and 'orig_lang'.
# Print the grouped data frames, which include the "anomaly_tag" column.
print(grouped_genre)
print(grouped_status)
print(grouped_orig_lang)
# The code above prints the grouped data frames grouped_genre,
# grouped_status, and grouped_orig_lang, including their "anomaly_tag"
# columns.
# Give each summary table its own, uniquely named anomaly column so the three
# tags do not collide once they are joined onto `data`.
grouped_genre <- grouped_genre %>%
  mutate(
    genre_anomaly = ifelse(probability == min(probability), "Anomaly", "Normal")
  )
grouped_status <- grouped_status %>%
  mutate(
    status_anomaly = ifelse(probability == min(probability), "Anomaly", "Normal")
  )
grouped_orig_lang <- grouped_orig_lang %>%
  mutate(
    orig_lang_anomaly = ifelse(
      probability == min(probability), "Anomaly", "Normal"
    )
  )

# Attach the three tags to every row of the original data. Only the key and
# the tag column are selected from each summary table, so no other summary
# columns are copied over.
data <- data %>%
  left_join(select(grouped_genre, genre, genre_anomaly), by = "genre") %>%
  left_join(select(grouped_status, status, status_anomaly), by = "status") %>%
  left_join(
    select(grouped_orig_lang, orig_lang, orig_lang_anomaly),
    by = "orig_lang"
  )

head(data)
# The code above adds a uniquely named anomaly column (genre_anomaly,
# status_anomaly, and orig_lang_anomaly) to each grouped data frame
# (grouped_genre, grouped_status, and grouped_orig_lang) and then left-joins
# this anomaly information back onto the original data frame `data`.
# Box plot of `score` split by the genre anomaly tag.
ggplot(data, aes(x = genre_anomaly, y = score)) +
  geom_boxplot() +
  labs(
    title = "Comparison of Scores for Normal and Anomaly Groups",
    x = "Anomaly Tag",
    y = "Score"
  ) +
  theme_minimal()
# The code above uses the ggplot2 library to draw a box plot that compares
# the scores of the normal and anomaly groups, based on the "genre_anomaly"
# column.
# Cross-tabulate every genre / status / original-language combination.
combination_counts <- table(
  Genre = data$genre,
  Status = data$status,
  Orig_Lang = data$orig_lang
)

# Build the lookup grid from the table's own dimnames so that row i of
# `combinations` describes cell i of `combination_counts`. A grid built from
# unique() is ordered by first appearance, whereas table() orders its levels,
# so the linear indices returned by which() would point at the wrong rows.
# Character columns are used because paste() on a data-frame row prints
# factor columns as integer codes.
combinations <- expand.grid(
  dimnames(combination_counts),
  stringsAsFactors = FALSE
)

missing_combinations <- which(combination_counts == 0)
most_common_combinations <- which(
  combination_counts == max(combination_counts)
)

# "Least common" is taken over combinations that actually occur; otherwise
# the minimum is 0 whenever any combination is missing and this set would
# simply repeat `missing_combinations`.
observed_counts <- combination_counts[combination_counts > 0]
least_common_combinations <- which(
  combination_counts == min(observed_counts)
)

# Print a header followed by one "Name = value" line per selected grid row.
print_combinations <- function(header, combinations, row_indices) {
  cat(header, "\n", sep = "")
  for (row_index in row_indices) {
    combination <- combinations[row_index, ]
    cat(paste(names(combination), unlist(combination), sep = " = "), "\n")
  }
  invisible(NULL)
}

# Print the missing combinations.
print_combinations("Missing Combinations:", combinations, missing_combinations)

# Most common combinations
# Print the most common combinations.
print_combinations(
  "Common Combinations:", combinations, most_common_combinations
)

# Print the least common combinations.
print_combinations(
  "Least Common Combinations:", combinations, least_common_combinations
)