Load the different packages:
library(bitops)
library(stringr)
library(XML)
library(RCurl)
library(RMySQL)
library(reshape2)
library(ggplot2)
inputdir="C:/Users/talha/Documents/Training/CUNY Classes/IS607/Week2"
outputdir="C:/Users/talha/Documents/Training/CUNY Classes/IS607/Week2"
we load the data on movies from (the web) using BoxOffice Mojo
#load URLs
box<-getURL("http://www.boxofficemojo.com/quarterly/?view=releasedate&yr=2016&quarter=Q3")
# Parse the URLs
box_parsed<-htmlParse(box,encoding="UTF-8")
tables<-readHTMLTable(box_parsed,stringsAsFactors=FALSE)
The scrapped data require some cleaning and reformainting
#Select and clean the different tables
movie_table<-tables[[4]]
str(movie_table)
## 'data.frame': 105 obs. of 9 variables:
## $ V1: chr "Filter26 Aries5&2A23A24Abr.AEFAlcArgo.BGBSMBSTBVCineGalaxyCJCLCohenCol.Collective ECrnthCZDaltonDistrib.Drft.ECErosESXFBFCWFIPF"| __truncated__ "1" "2" "3" ...
## $ V2: chr NA "The Secret Life of Pets" "Suicide Squad" "Jason Bourne" ...
## $ V3: chr NA "Uni." "WB" "Uni." ...
## $ V4: chr NA "$363,350,370" "$313,782,332" "$160,397,780" ...
## $ V5: chr NA "4,381" "4,255" "4,039" ...
## $ V6: chr NA "$104,352,905" "$133,682,248" "$59,215,365" ...
## $ V7: chr NA "4,370" "4,255" "4,026" ...
## $ V8: chr NA "7/8" "8/5" "7/29" ...
## $ V9: chr NA "-" "-" "-" ...
# clean and name the different columns
movie_table<-movie_table[2:101,]
colnames(movie_table)<-c("Rank","Title","Studio","Gross_Q3","Total_Theaters","Opening","Open_theatre","Open_date","close_date")
# all the data are strings and need to be converted to numeric
movie_table$Gross_Q3<-str_replace_all(movie_table$Gross_Q3,"[$,]","")
movie_table$Opening<-str_replace_all(movie_table$Opening,"[$,]","")
movie_table$Total_Theaters<-str_replace_all(movie_table$Total_Theaters,"[,]","")
movie_table$Open_theatre<-str_replace_all(movie_table$Open_theatre,"[,]","")
# convert to numeric
movie_table$Gross_Q3<-as.numeric(movie_table$Gross_Q3)
movie_table$Total_Theaters<-as.numeric(movie_table$Total_Theaters)
movie_table$Opening<-as.numeric(movie_table$Opening)
## Warning: NAs introduced by coercion
movie_table$Rank<-as.numeric(movie_table$Rank)
movie_table$Gross_Q3<-movie_table$Gross_Q3/1000000
movie_table$Opening<-movie_table$Opening/1000000
movie_table[1:10,]
## Rank Title Studio Gross_Q3 Total_Theaters Opening
## 2 1 The Secret Life of Pets Uni. 363.35037 4381 104.35291
## 3 2 Suicide Squad WB 313.78233 4255 133.68225
## 4 3 Jason Bourne Uni. 160.39778 4039 59.21536
## 5 4 Star Trek Beyond Par. 157.55369 3928 59.25321
## 6 5 Ghostbusters (2016) Sony 127.05959 3963 46.01875
## 7 6 The Legend of Tarzan WB 126.64306 3591 38.52786
## 8 7 Bad Moms STX 110.03393 3215 23.81734
## 9 8 Sausage Party Sony 95.35986 3135 34.26353
## 10 9 The Purge: Election Year Uni. 79.04244 2821 31.51511
## 11 10 Don't Breathe SGem 75.32878 3384 26.41171
## Open_theatre Open_date close_date
## 2 4370 7/8 -
## 3 4255 8/5 -
## 4 4026 7/29 -
## 5 3928 7/22 -
## 6 3963 7/15 -
## 7 3561 7/1 9/15
## 8 3215 7/29 -
## 9 3103 8/12 -
## 10 2796 7/1 8/18
## 11 3051 8/26 -
The top ranked movies are then exported as a table to be used to develop the survey and input into the database
movie_names<-subset(movie_table,Rank<=10,Rank:Title)
write.table(movie_names,file.path(outputdir,"movie_names.csv"),row.names=FALSE,col.names=FALSE,quote=TRUE,sep=",",eol="\n")
Survey can be accessed at the link below https://www.surveymonkey.com/r/WJQ6PBG
The survey is conducted for six respondents and export the data. The data is exported and a SQL database is created. Please see MySQL code.
Using the SQL database we run the query in R.
rmysql.settingsfile<-"C:/Program Files/MySQL/MySQL Server 5.0/my.ini"
con <- dbConnect(RMySQL::MySQL(), dbname = "moviesurvey", username="root", password="password")
surveydata<-dbGetQuery(con,"SELECT * from survey")
dbDisconnect(con)
## [1] TRUE
Develop some initial exploratory plots
survey_reshape<-dcast(surveydata[,2:4],movie_id~survey_id,value.var="score")
survey_reshape$avgrank<-apply(survey_reshape[,2:7],1,mean)
survey_combined<-merge(movie_table,survey_reshape,by.x="Rank",by.y="movie_id")
# Develop some plots
ggplot(survey_combined,aes(avgrank,Gross_Q3,color=Studio, label=Title))+geom_point(size=3)+xlab("Average Survey Rank")+ylab("Q3 $ Millions")+ggtitle("Movie Rankings and Box Office Success")