# 1a, 1b - set the working directory, import the data, and view it
# NOTE(review): setwd() ties the script to one machine's folder layout. It is
# kept so the relative "movies.csv" path keeps resolving, but consider running
# the script from the project directory instead.
setwd("~/NYU/classes/2. R/Assignments/Lesson 5")
library(readr)
movies <- read_csv("movies.csv")
## Rows: 50 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Movie, Release_Date, Distributor, Genre, MPAA
## dbl (3): Rank, Gross_Sales, Tickets_Sold
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a data viewer window, so only call it in an interactive session;
# this lets the script also run under Rscript or when knitted.
if (interactive()) {
  View(movies)
}
# 1c - list the column names of the data set
colnames(movies)
## [1] "Rank"         "Movie"        "Release_Date" "Distributor"  "Genre"       
## [6] "MPAA"         "Gross_Sales"  "Tickets_Sold"
# 1d - report the number of rows and columns
dim(movies)
## [1] 50  8
# 2 - scatterplots of tickets sold against gross sales
# Raw values first, then the same data divided by increasingly large constants
# to find a readable axis scale.
plot(movies$Tickets_Sold, movies$Gross_Sales)

plot(movies$Tickets_Sold / 1000, movies$Gross_Sales / 1000)

plot(movies$Tickets_Sold / 100000, movies$Gross_Sales / 100000)

plot(movies$Tickets_Sold / 1000000, movies$Gross_Sales / 1000000)

# Store the per-million values as columns so the plots below can reuse them
movies$Tickets_Sold_millions <- movies$Tickets_Sold / 1000000
movies$Gross_Sales_millions <- movies$Gross_Sales / 1000000

# Labelled scatterplot on the millions scale
plot(
  movies$Tickets_Sold_millions,
  movies$Gross_Sales_millions,
  main = "number of tickets sold and gross ticket sales",
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  frame.plot = FALSE,
  col = "#4bcea3"
)

# 3 - correlation between tickets sold and gross sales
with(movies, cor(Tickets_Sold, Gross_Sales))
## [1] 1
# Answer: 1
# Is this expected? Yes
# 4 - scatterplot with a fitted line
# 4a, 4b - scatterplot on the millions scale with labels, plus a regression line
# type = "b" only joins the points in row order and is not a fitted line, so
# the points are drawn first and the least-squares line is overlaid.
plot(
  movies$Tickets_Sold_millions,
  movies$Gross_Sales_millions,
  main = "number of tickets sold and gross ticket sales",
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  frame.plot = FALSE,
  col = "#4bcea3"
)
# Linear regression of gross sales on tickets sold, drawn over the points
abline(
  lm(Gross_Sales_millions ~ Tickets_Sold_millions, data = movies),
  col = "#1f6f57",
  lwd = 2
)
# 5 - other plots
# 5a - vertical box plot of gross sales (millions scale)
boxplot(
  movies$Gross_Sales_millions,
  col = "#4cbea3",
  frame.plot = FALSE,
  main = "Top 50 grossing Films of 2024 (in millions of $USD)"
)

# 5b - the same box plot drawn horizontally
boxplot(
  movies$Gross_Sales_millions,
  col = "#4cbea3",
  frame.plot = FALSE,
  horizontal = TRUE,
  main = "Top 50 grossing Films of 2024 (in millions of $USD)"
)
# 5c - histogram of type of films
# Genre is categorical, so it is converted to a factor and its integer codes
# are binned. hist()'s default breaks can land on the codes themselves and
# lump two genres into one bin, so explicit half-integer breaks give every
# genre its own bar, and the axis is relabelled with the genre names.
movies$Genre_factor <- as.factor(movies$Genre)
genre_codes <- as.numeric(movies$Genre_factor)
n_genres <- nlevels(movies$Genre_factor)
hist(
  genre_codes,
  breaks = seq(0.5, n_genres + 0.5, by = 1),
  main = "Movies by Genre",
  xlab = "Genre category",
  col = "#4cbea3",
  border = "#FFFFFF",
  xaxt = "n"
)
# Replace the suppressed numeric axis with one tick per genre name
axis(
  side = 1,
  at = seq_len(n_genres),
  labels = levels(movies$Genre_factor),
  cex.axis = 0.7
)

# 5d - histogram of gross sales
# The plotted column is in millions of $USD, so the axis label states the unit.
hist(
  movies$Gross_Sales_millions,
  main = "Movies by Gross Sales",
  xlab = "Gross sales (in millions of $USD)",
  col = "#4cbea3",
  border = "#FFFFFF"
)

# 5e - histogram of gross sales with about 10 bins
# A single number for `breaks` is only a suggestion; hist() may choose a
# nearby "pretty" number of bins instead.
hist(
  movies$Gross_Sales_millions,
  main = "Movies by Gross Sales",
  xlab = "Gross sales (in millions of $USD)",
  col = "#4cbea3",
  border = "#FFFFFF",
  breaks = 10
)

# 5f - histograms of tickets sold (millions), at two bin resolutions
hist(
  movies$Tickets_Sold_millions,
  breaks = 10,
  main = "Number of Tickets Sold",
  xlab = "Tickets sold (millions)",
  col = "#4cbea3",
  border = "#FFFFFF"
)

hist(
  movies$Tickets_Sold_millions,
  breaks = 25,
  main = "Number of Tickets Sold",
  xlab = "Tickets sold (millions)",
  col = "#4cbea3",
  border = "#FFFFFF"
)

# 5g - histogram of tickets sold with the frequency count printed above each bar
hist(
  movies$Tickets_Sold_millions,
  breaks = 15,
  labels = TRUE,
  main = "Number of Tickets Sold",
  xlab = "Tickets sold (millions)",
  col = "#4cbea3",
  border = "#FFFFFF"
)

# 5h - bar plot of the number of movies in each genre
# barplot() needs numeric bar heights. Passing the genre names themselves made
# it coerce text to NA (the "NAs introduced by coercion" warning), so the
# genres are counted with table() and the counts are plotted instead.
# Genre_matrix is no longer used for the plot; it is kept only in case other
# code refers to the column.
movies$Genre_matrix <- as.matrix(movies$Genre)
barplot(
  table(movies$Genre),
  main = "Movies by Genre",
  xlab = "Genre category",
  ylab = "Number of movies",
  col = "#4cbea3",
  cex.names = 0.8
)