# Load the movie data set from a local CSV file into `data`.
# NOTE(review): the absolute path is machine-specific; adjust it before
# running this script on another machine.
data <- read.csv("C:/Users/lohit allaparti/Downloads/imdb.csv")

# Group_By

library(dplyr)

# Per-category summary tables. Besides the mean of the metric of interest,
# each table keeps the number of rows in the category (`n`); that count is
# what calculate_probabilities() uses to estimate category probabilities.
grouped_genre <- data %>%
  group_by(genre) %>%
  summarize(mean_score = mean(score, na.rm = TRUE), n = n())
print(grouped_genre)

grouped_status <- data %>%
  group_by(status) %>%
  summarize(mean_revenue = mean(revenue, na.rm = TRUE), n = n())
print(grouped_status)

grouped_orig_lang <- data %>%
  group_by(orig_lang) %>%
  summarize(mean_budget = mean(budget_x, na.rm = TRUE), n = n())
print(grouped_orig_lang)

# After running this code we will have three separate summary tables
# (grouped_genre, grouped_status, and grouped_orig_lang).

# Function to calculate probabilities and assign "anomaly" tag
#
# Arguments:
#   grouped_data  Summary table with one row per category and a count
#                 column `n` (number of rows of the raw data in the category).
#   column_name   Name (string) of the category column in `grouped_data`.
#
# Returns `grouped_data` with two extra columns:
#   probability   Share of all rows that fall into the category (n / sum(n)).
#   anomaly_tag   "Anomaly" for the category (or tied categories) with the
#                 smallest probability, "Normal" otherwise.
calculate_probabilities <- function(grouped_data, column_name) {
  stopifnot(
    is.character(column_name),
    length(column_name) == 1,
    column_name %in% names(grouped_data),
    "n" %in% names(grouped_data)
  )

  # The probability must come from the per-category counts. Calling
  # n() / sum(n()) here would always give 1, because the table is already
  # summarised to one row per category.
  grouped_data <- grouped_data %>%
    ungroup() %>%
    mutate(probability = .data$n / sum(.data$n))

  # The minimum is taken from the same vector it is compared against, so the
  # exact equality test below is safe.
  min_probability <- min(grouped_data$probability)

  grouped_data %>%
    mutate(
      anomaly_tag = ifelse(probability == min_probability, "Anomaly", "Normal")
    )
}

grouped_genre <- calculate_probabilities(grouped_genre, "genre")
grouped_status <- calculate_probabilities(grouped_status, "status")
grouped_orig_lang <- calculate_probabilities(grouped_orig_lang, "orig_lang")

print(grouped_genre)
print(grouped_status)
print(grouped_orig_lang)

# The code above calculates probabilities and assigns an "anomaly" tag to the
# groups within each of the three categorical variables: 'genre', 'status',
# and 'orig_lang'.

# Printing the grouped data frames, including their "anomaly_tag" columns
print(grouped_genre)
print(grouped_status)
print(grouped_orig_lang)

# The code above prints the grouped data frames grouped_genre,
# grouped_status, and grouped_orig_lang, including their "anomaly_tag"
# columns.

# Create grouped data frames with unique "anomaly_tag" column names

# Add an anomaly column with a table-specific name to each grouped data frame,
# so the three tags do not collide once they are joined onto `data`.
# A category is tagged "Anomaly" when its probability equals the smallest
# probability in its table, and "Normal" otherwise.
grouped_genre <- grouped_genre %>%
  mutate(
    genre_anomaly = ifelse(probability == min(probability), "Anomaly", "Normal")
  )

grouped_status <- grouped_status %>%
  mutate(
    status_anomaly = ifelse(probability == min(probability), "Anomaly", "Normal")
  )

grouped_orig_lang <- grouped_orig_lang %>%
  mutate(
    orig_lang_anomaly = ifelse(
      probability == min(probability), "Anomaly", "Normal"
    )
  )

# Left join "anomaly_tag" information to the original data frame

# Attach the three anomaly columns to every row of `data`, matching on the
# corresponding category column. Only the key and the anomaly column are
# selected from each grouped table so no other summary columns are carried
# over.
data <- data %>%
  left_join(
    select(grouped_genre, genre, genre_anomaly),
    by = "genre"
  ) %>%
  left_join(
    select(grouped_status, status, status_anomaly),
    by = "status"
  ) %>%
  left_join(
    select(grouped_orig_lang, orig_lang, orig_lang_anomaly),
    by = "orig_lang"
  )

# Show the first rows to check that the new columns are present.
head(data)

# The code above adds anomaly columns with unique names (genre_anomaly,
# status_anomaly, and orig_lang_anomaly) to the grouped data frames
# (grouped_genre, grouped_status, and grouped_orig_lang) and then left joins
# this anomaly information back to the original data frame `data`.

# Create a bar plot to visualize anomaly tags by genre

library(ggplot2)

# Bar chart of row counts per genre, with bars filled by the genre anomaly tag.
# theme() is applied after theme_minimal() so the rotated x-axis labels are
# not reset by the complete theme.
ggplot(data, aes(x = genre, fill = genre_anomaly)) +
  geom_bar() +
  labs(
    title = "Distribution of Anomaly Tags by Genre",
    x = "Genre",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# The code above generates a bar plot with the ggplot2 library to visualize
# the distribution of anomaly tags by genre.

# Creating a box plot to compare "score" for normal and anomaly groups

# Box plot of `score`, one box per value of the genre anomaly tag.
ggplot(data, aes(x = genre_anomaly, y = score)) +
  geom_boxplot() +
  labs(
    title = "Comparison of Scores for Normal and Anomaly Groups",
    x = "Anomaly Tag",
    y = "Score"
  ) +
  theme_minimal()

# The code above generates a box plot with the ggplot2 library to compare the
# scores of the normal and anomaly groups, based on the "genre_anomaly"
# column.

# Creating a pie chart to visualize the proportion of anomaly tags by status

# The pie chart needs one row per slice. `status_proportions` is not created
# anywhere earlier in the script, so build it here: the number of rows of
# `data` for each status anomaly tag, stored in a column named `count`.
status_proportions <- data %>%
  count(status_anomaly, name = "count")

# A single stacked bar drawn in polar coordinates becomes a pie chart.
ggplot(status_proportions, aes(x = "", y = count, fill = status_anomaly)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  labs(
    title = "Proportion of Anomaly Tags by Status",
    fill = "Anomaly Tag"
  ) +
  theme_void()

# Code to generate all combinations

# Count combinations in the dataset: a 3-way contingency table of
# genre x status x original language.
combination_counts <- table(
  Genre = data$genre,
  Status = data$status,
  Orig_Lang = data$orig_lang
)

# All possible combinations, one per row. The grid is built from the table's
# own dimnames so that row i of `combinations` describes cell i of
# `combination_counts`: both vary the first dimension fastest. Building it
# from unique() instead would use order of appearance (and keep NA), which
# does not line up with the sorted, NA-free levels used by table().
# stringsAsFactors = FALSE keeps the labels as plain strings for printing.
combinations <- expand.grid(
  dimnames(combination_counts),
  stringsAsFactors = FALSE
)

# Linear cell indices (usable as row numbers of `combinations`).
missing_combinations <- which(combination_counts == 0)

most_common_combinations <- which(
  combination_counts == max(combination_counts)
)

# "Least common" is taken among combinations that actually occur; otherwise
# the minimum would be 0 whenever any combination is missing and this would
# simply repeat `missing_combinations`.
observed_counts <- combination_counts[combination_counts > 0]
least_common_combinations <- if (length(observed_counts) > 0) {
  which(combination_counts == min(observed_counts))
} else {
  integer(0)
}

# Missing combinations

cat("Missing Combinations:\n")
for (row_index in missing_combinations) {
  combination <- combinations[row_index, ]
  # Convert each cell explicitly: paste() on a data-frame row would print the
  # integer codes of factor columns instead of their labels.
  values <- vapply(combination, as.character, character(1))
  # One combination per line, e.g. "Genre = ... Status = ... Orig_Lang = ...".
  cat(paste(names(combination), values, sep = " = "), "\n")
}
# The code above prints out the missing combinations.

# Most common combinations
cat("Common Combinations:\n")
for (row_index in most_common_combinations) {
  combination <- combinations[row_index, ]
  values <- vapply(combination, as.character, character(1))
  cat(paste(names(combination), values, sep = " = "), "\n")
}

# The code above prints out the most common combinations.

# Least common combinations

# The heading names the list that is actually printed here (the original
# label repeated "Common Combinations:" from the previous section).
cat("Least Common Combinations:\n")
for (row_index in least_common_combinations) {
  combination <- combinations[row_index, ]
  # Convert each cell explicitly: paste() on a data-frame row would print the
  # integer codes of factor columns instead of their labels.
  values <- vapply(combination, as.character, character(1))
  # One combination per line, e.g. "Genre = ... Status = ... Orig_Lang = ...".
  cat(paste(names(combination), values, sep = " = "), "\n")
}
# The code above prints out the least common combinations.

# LohitDATADIVE3

# 2023-09-10