library(tidyverse)
For this assignment, I surveyed 5 people with the following question:
Please rate these movies on a scale of 1-5, with NA if you did not see them. 1. Deadpool and Wolverine 2. Everything, Everywhere, All at Once 3. Barbie 4. Oppenheimer 5. Spiderman Into the Spiderverse 6. Puss in Boots 2
I created a table in SSMS and then performed some data cleaning to bring the data into a more standardized format. Data was rounded to the nearest integer when decimal ratings were given, and lack of rating for a movie (i.e. when the respondent did not see the movie) was transformed to 0. The 0 will later be transformed to NA in R. If the rating fell out of the standard 1-5 scale, I transformed the data to a specified floor (1) or ceiling (5).
Drop Table if Exists #PopularMovies
Drop Table if Exists PopularMovies
create table #PopularMovies (
RespondentInitials nvarchar(10),
Deadpool3 nvarchar(30),
EverythingEverywhere nvarchar(30),
Barbie nvarchar(30),
Oppenheimer nvarchar(30),
IntotheSpiderverse nvarchar(30),
PussInBoots2 nvarchar(30));
Insert into #PopularMovies
values
('PH','5','4','4','4','5','5'),
('MM','3.5','10000000','NA','3.8','3.5','4'),
('TH','4','4','3','3','5','5'),
('KK','N/A','N/A','4','4','4','N/A'),
('RD','NA','5','3','5','3','NA');
With dataCleaning as (
Select RespondentInitials,
case when isnumeric(Deadpool3) = 1 then round(Deadpool3,0) else '' end as Deadpool3,
case when isnumeric(EverythingEverywhere) = 1 then round(EverythingEverywhere,0) else '' end as EverythingEverywhere,
case when isnumeric(Barbie) = 1 then round(Barbie,0) else '' end as Barbie,
case when isnumeric(Oppenheimer) = 1 then round(Oppenheimer,0) else '' end as Oppenheimer,
case when isnumeric(IntotheSpiderverse) = 1 then round(IntotheSpiderverse,0) else '' end as IntotheSpiderverse,
case when isnumeric(PussInBoots2) = 1 then round(PussInBoots2,0) else '' end as PussInBoots2
from #PopularMovies)
Select RespondentInitials,
case when Deadpool3 = '' then '' when Deadpool3 > 5 then 5 when Deadpool3 < 1 then 1 else Deadpool3 end as Deadpool3,
case when EverythingEverywhere = '' then '' when EverythingEverywhere > 5 then 5 when EverythingEverywhere < 1 then 1 else EverythingEverywhere end as EverythingEverywhere,
case when Barbie = '' then '' when Barbie > 5 then 5 when Barbie < 1 then 1 else Barbie end as Barbie,
case when Oppenheimer = '' then '' when Oppenheimer > 5 then 5 when Oppenheimer < 1 then 1 else Oppenheimer end as Oppenheimer,
case when IntotheSpiderverse = '' then '' when IntotheSpiderverse > 5 then 5 when IntotheSpiderverse < 1 then 1 else IntotheSpiderverse end as IntotheSpiderverse,
case when PussInBoots2 = '' then '' when PussInBoots2 > 5 then 5 when PussInBoots2 < 1 then 1 else PussInBoots2 end as PussInBoots2
into PopularMovies
from dataCleaning
Select * from PopularMovies
PopularMovies <- read.csv("C:\\Users\\Kim\\Documents\\Data607\\DATA_607_Koon_Assignment_2.csv", header = TRUE, sep = ",")
PopularMovies
## RespondentInitials Deadpool3 EverythingEverywhere Barbie Oppenheimer
## 1 PH 5 4 4 4
## 2 MM 4 5 0 4
## 3 TH 4 4 3 3
## 4 KK 0 0 4 4
## 5 RD 0 5 3 5
## IntotheSpiderverse PussInBoots2
## 1 5 5
## 2 4 4
## 3 5 5
## 4 4 0
## 5 3 0
PopularMovies[PopularMovies == 0] <- NA
PopularMovies <- subset(PopularMovies, select = -c(RespondentInitials))
PopularMovies
## Deadpool3 EverythingEverywhere Barbie Oppenheimer IntotheSpiderverse
## 1 5 4 4 4 5
## 2 4 5 NA 4 4
## 3 4 4 3 3 5
## 4 NA NA 4 4 4
## 5 NA 5 3 5 3
## PussInBoots2
## 1 5
## 2 4
## 3 5
## 4 NA
## 5 NA
PopularMoviesSummary <-colMeans(PopularMovies,na.rm=TRUE)
PopularMoviesSummary
## Deadpool3 EverythingEverywhere Barbie
## 4.333333 4.500000 3.500000
## Oppenheimer IntotheSpiderverse PussInBoots2
## 4.000000 4.200000 4.666667
PopularMoviesSummary <- PopularMoviesSummary[order(PopularMoviesSummary,decreasing = TRUE)]
barplot(PopularMoviesSummary, ylim = c(0,5), ylab = "Average Movie Rating", xlab = "Movie")
The results of this exercise are not surprising. Each popular movie scored an average rating of 3.5 or higher. If I were to do this exercise again, I would probably change the structure of the data in SSMS. Instead of having a single table, I would input data as a Movie and rating pair. This would make the data easier to manipulate in R with ggplot rather than having to create column summaries for each Movie column. For example, I would recommend doing the following:
Drop Table if Exists #PopularMovies
Drop Table if Exists PopularMovies
create table #PopularMovies (
RespondentInitials nvarchar(10),
Movie nvarchar(200),
Rating nvarchar(30),);
Insert into #PopularMovies
values
('PH','Deadpool and Wolverine', '5'),
('MM','Deadpool and Wolverine', '3.5'),
('TH','Deadpool and Wolverine', '4'),
('KK','Deadpool and Wolverine', 'N/A'),
('RD','Deadpool and Wolverine', 'NA'),
('PH','Everything, Everywhere, All at Once', '4'),
('MM','Everything, Everywhere, All at Once', '10000000'),
('TH','Everything, Everywhere, All at Once', '4'),
('KK','Everything, Everywhere, All at Once', 'N/A'),
('RD','Everything, Everywhere, All at Once', '5'),
('PH','Barbie', '4'),
('MM','Barbie', 'NA'),
('TH','Barbie', '3'),
('KK','Barbie', '4'),
('RD','Barbie', '3'),
('PH','Oppenheimer', '4'),
('MM','Oppenheimer', '3.8'),
('TH','Oppenheimer', '3'),
('KK','Oppenheimer', '4'),
('RD','Oppenheimer', '5'),
('PH','Spiderman Into the Spiderverse', '5'),
('MM','Spiderman Into the Spiderverse', '3.5'),
('TH','Spiderman Into the Spiderverse', '5'),
('KK','Spiderman Into the Spiderverse', '4'),
('RD','Spiderman Into the Spiderverse', '3'),
('PH','Puss in Boots 2', '5'),
('MM','Puss in Boots 2', '4'),
('TH','Puss in Boots 2', '5'),
('KK','Puss in Boots 2', 'N/A'),
('RD','Puss in Boots 2', 'N.A');
With dataCleaning as (
Select RespondentInitials,
Movie,
case when isnumeric(Rating) = 1 then round(Rating,0) else '' end as Rating
from #PopularMovies)
Select RespondentInitials,
Movie,
case when Rating = '' then '' when Rating > 5 then 5 when Rating < 1 then 1 else Rating end as Rating
into PopularMovies
from dataCleaning
Select * from PopularMovies
PopularMovies <- read.csv("C:\\Users\\Kim\\Documents\\Data607\\DATA_607_Koon_Assignment_2_Retry.csv", header = TRUE, sep = ",")
PopularMovies
## RespondentInitials Movie Rating
## 1 PH Deadpool and Wolverine 5
## 2 MM Deadpool and Wolverine 4
## 3 TH Deadpool and Wolverine 4
## 4 KK Deadpool and Wolverine 0
## 5 RD Deadpool and Wolverine 0
## 6 PH Everything, Everywhere, All at Once 4
## 7 MM Everything, Everywhere, All at Once 5
## 8 TH Everything, Everywhere, All at Once 4
## 9 KK Everything, Everywhere, All at Once 0
## 10 RD Everything, Everywhere, All at Once 5
## 11 PH Barbie 4
## 12 MM Barbie 0
## 13 TH Barbie 3
## 14 KK Barbie 4
## 15 RD Barbie 3
## 16 PH Oppenheimer 4
## 17 MM Oppenheimer 4
## 18 TH Oppenheimer 3
## 19 KK Oppenheimer 4
## 20 RD Oppenheimer 5
## 21 PH Spiderman Into the Spiderverse 5
## 22 MM Spiderman Into the Spiderverse 4
## 23 TH Spiderman Into the Spiderverse 5
## 24 KK Spiderman Into the Spiderverse 4
## 25 RD Spiderman Into the Spiderverse 3
## 26 PH Puss in Boots 2 5
## 27 MM Puss in Boots 2 4
## 28 TH Puss in Boots 2 5
## 29 KK Puss in Boots 2 0
## 30 RD Puss in Boots 2 0
PopularMovies[PopularMovies == 0] <- NA
PopularMovies
## RespondentInitials Movie Rating
## 1 PH Deadpool and Wolverine 5
## 2 MM Deadpool and Wolverine 4
## 3 TH Deadpool and Wolverine 4
## 4 KK Deadpool and Wolverine NA
## 5 RD Deadpool and Wolverine NA
## 6 PH Everything, Everywhere, All at Once 4
## 7 MM Everything, Everywhere, All at Once 5
## 8 TH Everything, Everywhere, All at Once 4
## 9 KK Everything, Everywhere, All at Once NA
## 10 RD Everything, Everywhere, All at Once 5
## 11 PH Barbie 4
## 12 MM Barbie NA
## 13 TH Barbie 3
## 14 KK Barbie 4
## 15 RD Barbie 3
## 16 PH Oppenheimer 4
## 17 MM Oppenheimer 4
## 18 TH Oppenheimer 3
## 19 KK Oppenheimer 4
## 20 RD Oppenheimer 5
## 21 PH Spiderman Into the Spiderverse 5
## 22 MM Spiderman Into the Spiderverse 4
## 23 TH Spiderman Into the Spiderverse 5
## 24 KK Spiderman Into the Spiderverse 4
## 25 RD Spiderman Into the Spiderverse 3
## 26 PH Puss in Boots 2 5
## 27 MM Puss in Boots 2 4
## 28 TH Puss in Boots 2 5
## 29 KK Puss in Boots 2 NA
## 30 RD Puss in Boots 2 NA
PopularMoviesSummary <- PopularMovies %>%
group_by(Movie) %>%
mutate(averageRatings = mean(Rating, na.rm = TRUE))
PopularMoviesSummary <- subset(PopularMoviesSummary, select = -c(RespondentInitials,Rating)) %>%
distinct()
PopularMoviesSummary
## # A tibble: 6 × 2
## # Groups: Movie [6]
## Movie averageRatings
## <chr> <dbl>
## 1 Deadpool and Wolverine 4.33
## 2 Everything, Everywhere, All at Once 4.5
## 3 Barbie 3.5
## 4 Oppenheimer 4
## 5 Spiderman Into the Spiderverse 4.2
## 6 Puss in Boots 2 4.67
PopularMovies %>%
group_by(Movie) %>%
summarise(averageRatings = mean(Rating, na.rm = TRUE)) %>%
ggplot(aes( x = averageRatings, y = reorder(Movie,-averageRatings)),na.rm = TRUE) +
geom_col() + geom_text(aes(label = round(averageRatings,1)),
position = position_dodge(width = 1),
hjust = -.1, size = 3) +
labs(title = "Average Movie Ratings among Respondents on a 5 Point Scale",
x = "Average Rating", y = "Movie")