# 1a,b - set the working directory, import the data, view the data
# NOTE(review): setwd() with a machine-specific path makes the script
# non-portable; it is kept here only so the relative "movies.csv" path below
# keeps resolving. Prefer running the script from the project directory.
setwd("~/NYU/classes/2. R/Assignments/Lesson 5")
library(readr)
movies <- read_csv("movies.csv")
## Rows: 50 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Movie, Release_Date, Distributor, Genre, MPAA
## dbl (3): Rank, Gross_Sales, Tickets_Sold
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a data viewer, which is only useful (and only reliably
# available) in an interactive session, so skip it when the script is run
# non-interactively (e.g. via Rscript or when rendering a report).
if (interactive()) {
  View(movies)
}
# 1c - list the column names of the imported data
colnames(movies)
## [1] "Rank" "Movie" "Release_Date" "Distributor" "Genre"
## [6] "MPAA" "Gross_Sales" "Tickets_Sold"
# 1d - report the dimensions of the data as (rows, columns)
c(nrow(movies), ncol(movies))
## [1] 50 8
#2 - scatterplots
# Exploratory scatterplots of gross sales (y) against tickets sold (x), redrawn
# with both variables divided by progressively larger constants to compare how
# the axis scales read. No xlab/ylab is supplied, so the axes are labelled from
# the argument expressions themselves.
# Raw values:
plot(movies$Tickets_Sold, movies$Gross_Sales)

# Both variables divided by 1,000:
plot(movies$Tickets_Sold/1000, movies$Gross_Sales/1000)

# Both variables divided by 100,000:
plot(movies$Tickets_Sold/100000, movies$Gross_Sales/100000)

# Both variables divided by 1,000,000 (i.e. expressed in millions):
plot(movies$Tickets_Sold/1000000, movies$Gross_Sales/1000000)

# Store both measures rescaled to millions as new columns so that later plots
# can reuse them without repeating the division.
movies[["Tickets_Sold_millions"]] <- movies[["Tickets_Sold"]] / 1e6
movies[["Gross_Sales_millions"]] <- movies[["Gross_Sales"]] / 1e6
# Labelled scatterplot of the rescaled variables, drawn without a frame.
plot(
  x = movies[["Tickets_Sold_millions"]],
  y = movies[["Gross_Sales_millions"]],
  col = "#4bcea3",
  frame.plot = FALSE,
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  main = "number of tickets sold and gross ticket sales"
)

# 3 - Pearson correlation between tickets sold and gross sales
cor(
  x = movies[["Tickets_Sold"]],
  y = movies[["Gross_Sales"]]
)
## [1] 1
# Answer: the correlation is 1.
# Is this expected? Yes.
# NOTE(review): presumably gross sales were derived as tickets sold times a
# single average ticket price, which would make the relationship exactly
# linear - confirm against the data source.
# 4 - scatterplot with a fitted line
# 4a,b - scatterplot on the millions scale with labels, plus a regression line.
# The points are drawn first; the least-squares line is then overlaid with
# abline(). (type = "b" would only join the points in row order, which is not
# a regression line.)
plot(
  movies$Tickets_Sold_millions,
  movies$Gross_Sales_millions,
  main = "number of tickets sold and gross ticket sales",
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  frame.plot = FALSE,
  col = "#4bcea3"
)
# Fit gross sales on tickets sold (both in millions) and draw the fitted line.
abline(
  lm(Gross_Sales_millions ~ Tickets_Sold_millions, data = movies),
  col = "#333333",
  lwd = 2
)

# 5 - other plots
# 5a - vertical box plot of gross sales (in millions)
boxplot(
  movies[["Gross_Sales_millions"]],
  col = "#4cbea3",
  frame.plot = FALSE,
  main = "Top 50 grossing Films of 2024 (in millions of $USD)"
)

# 5b - the same box plot, drawn horizontally
boxplot(
  movies[["Gross_Sales_millions"]],
  col = "#4cbea3",
  frame.plot = FALSE,
  main = "Top 50 grossing Films of 2024 (in millions of $USD)",
  horizontal = TRUE
)

# 5c - histogram of film genres
# Genre is categorical, so it is converted to a factor and its integer codes
# are what hist() actually bins.
movies$Genre_factor <- as.factor(movies$Genre)
genre_levels <- levels(movies$Genre_factor)
# Break points sit at half-integers so that every genre code gets its own bar;
# with hist()'s default breaks the first bin is closed on both sides and would
# lump codes 1 and 2 together.
hist(
  as.numeric(movies$Genre_factor),
  breaks = seq(0.5, length(genre_levels) + 0.5, by = 1),
  main = "Movies by Genre",
  xlab = "Genre category",
  col = "#4cbea3",
  border = "#FFFFFF",
  xaxt = "n"
)
# Replace the numeric code axis with the genre names (drawn perpendicular to
# the axis and slightly smaller so that all of them fit).
axis(
  side = 1,
  at = seq_along(genre_levels),
  labels = genre_levels,
  las = 2,
  cex.axis = 0.7
)

# 5d - histogram of gross sales (in millions) using the default binning
hist(
  movies[["Gross_Sales_millions"]],
  border = "#FFFFFF",
  col = "#4cbea3",
  xlab = "Gross Sales",
  main = "Movies by Gross Sales"
)

# 5e - the same histogram, asking for roughly 10 bins
hist(
  movies[["Gross_Sales_millions"]],
  breaks = 10,
  border = "#FFFFFF",
  col = "#4cbea3",
  xlab = "Gross Sales",
  main = "Movies by Gross Sales"
)

# 5f - histograms of tickets sold (in millions) at two bin resolutions
hist(
  movies[["Tickets_Sold_millions"]],
  breaks = 10,
  border = "#FFFFFF",
  col = "#4cbea3",
  xlab = "Tickets sold (millions)",
  main = "Number of Tickets Sold"
)

hist(
  movies[["Tickets_Sold_millions"]],
  breaks = 25,
  border = "#FFFFFF",
  col = "#4cbea3",
  xlab = "Tickets sold (millions)",
  main = "Number of Tickets Sold"
)

# 5g - histogram of tickets sold with the frequency count printed above each bar
hist(
  movies[["Tickets_Sold_millions"]],
  breaks = 15,
  labels = TRUE,
  border = "#FFFFFF",
  col = "#4cbea3",
  xlab = "Tickets sold (millions)",
  main = "Number of Tickets Sold"
)

# 5h - bar plot of Genre
# The Genre_matrix column is retained in case later code refers to it, but it
# is not used for plotting: barplot() needs numeric bar heights, and passing a
# character matrix only yields "NAs introduced by coercion".
movies$Genre_matrix <- as.matrix(movies$Genre)
# Count the movies in each genre and plot one bar per genre.
genre_counts <- table(movies$Genre)
barplot(
  genre_counts,
  main = "Movies by Genre",
  xlab = "Genre category",
  ylab = "Number of movies",
  col = "#4cbea3",
  cex.names = 0.7
)
