## Warning: package 'ggplot2' was built under R version 3.4.1
## Warning: package 'ggthemes' was built under R version 3.4.1
## Warning: package 'scales' was built under R version 3.4.1
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'rpart' was built under R version 3.4.2
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Warning: package 'rpart.plot' was built under R version 3.4.2
## Warning: package 'corrr' was built under R version 3.4.1
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
## Warning: package 'glue' was built under R version 3.4.2
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
## Warning: package 'caTools' was built under R version 3.4.1
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Loading required package: knitr
## Warning: package 'knitr' was built under R version 3.4.2
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Warning: package 'caret' was built under R version 3.4.1
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
## Warning: package 'visdat' was built under R version 3.4.1
## Warning: package 'psych' was built under R version 3.4.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Warning: package 'leaflet' was built under R version 3.4.1
## Warning: package 'leaflet.extras' was built under R version 3.4.1
## Warning: package 'PerformanceAnalytics' was built under R version 3.4.2
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.4.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
## Warning: package 'GPArotation' was built under R version 3.4.1
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
## Warning: package 'MASS' was built under R version 3.4.1
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'psy' was built under R version 3.4.1
##
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
##
## wkappa
## Warning: package 'corpcor' was built under R version 3.4.1
## Warning: package 'fastmatch' was built under R version 3.4.1
##
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
##
## coalesce
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Warning: package 'ggcorrplot' was built under R version 3.4.2
Read the File and do the descriptive analystics
bollywood_song <- read.csv("bollywod_clustering.csv")
plot(bollywood_song)
str(bollywood_song)
## 'data.frame': 149 obs. of 8 variables:
## $ Movie_Name : Factor w/ 149 levels "2 States","3G",..: 57 7 148 100 54 104 60 51 67 48 ...
## $ Box_Office_Collection: int 27 25 14 32 22 34 2 14 12 28 ...
## $ Profit : int 15 7 -6 19 -27 -6 -11 -13 7 12 ...
## $ Earning_Ratio : num 2.3 1.39 0.7 2.47 0.45 0.85 0.13 0.53 2.64 1.76 ...
## $ Budget : int 12 18 20 13 48 40 12 27 5 16 ...
## $ Youtube_Views : int 13798789 8788913 6698987 5592977 5192338 5176897 4823892 4687259 4674795 4321162 ...
## $ Youtube_Likes : int 15708 18907 6750 15464 6928 10672 5797 13219 3706 10126 ...
## $ Youtube_Dislikes : int 5226 2940 2234 1513 1120 1392 227 1605 762 964 ...
summary(bollywood_song)
## Movie_Name Box_Office_Collection Profit
## 2 States : 1 Min. : 0.00 Min. :-57.00
## 3G : 1 1st Qu.: 9.00 1st Qu.: -7.00
## Aashiqui 2 : 1 Median : 28.00 Median : 3.00
## Aatma : 1 Mean : 55.72 Mean : 26.22
## ABCD <U+0096> Any Body Can Dance: 1 3rd Qu.: 57.00 3rd Qu.: 27.00
## Action Jackson : 1 Max. :735.00 Max. :650.00
## (Other) :143
## Earning_Ratio Budget Youtube_Views Youtube_Likes
## Min. :0.010 Min. : 2.00 Min. : 4354 Min. : 1
## 1st Qu.:0.530 1st Qu.: 11.00 1st Qu.: 1076591 1st Qu.: 1377
## Median :1.200 Median : 21.00 Median : 2375050 Median : 4111
## Mean :1.769 Mean : 29.44 Mean : 3337920 Mean : 7878
## 3rd Qu.:2.390 3rd Qu.: 35.00 3rd Qu.: 4550051 3rd Qu.: 9100
## Max. :9.170 Max. :150.00 Max. :23171067 Max. :101275
##
## Youtube_Dislikes
## Min. : 1
## 1st Qu.: 189
## Median : 614
## Mean : 1208
## 3rd Qu.: 1419
## Max. :11888
##
par(mfrow=c(1, 2))
boxplot(bollywood_song$Profit, outline= TRUE, col = "blue" , main = "Profit")
boxplot(bollywood_song$Budget, outline= TRUE, col = "blue" , main = "Budget")
boxplot(bollywood_song$Youtube_Likes, outline= TRUE, col = "blue" , main = "Youtube Likes")
boxplot(bollywood_song$Youtube_Views, outline= TRUE, col = "blue" , main = "Youtibe Views")
boxplot(bollywood_song$Youtube_Dislikes, outline= TRUE, col = "blue" , main = "Youube Dislikes")
Do the clustering now with K means.Have done the scaling after removing the factor variable from the data frame. Scaling is must before k means to remove the biasness of the variable having higher range values
bollywood_song.RCDF <- scale(bollywood_song[,2:8])
bollywood_song.cluster <- kmeans(bollywood_song.RCDF, centers = 3, nstart = 2, iter.max = 10) #we keep number of iter.max=15 to ensure the algorithm converges and nstart=50 to #ensure that atleat 4 random sets are choosen
attributes(bollywood_song.cluster) # see the cluster attributes
## $names
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
##
## $class
## [1] "kmeans"
mystand <- scale(bollywood_song[-1])
bollywood_song.cluster$size
## [1] 109 5 35
names(bollywood_song.cluster)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
bollywood_song.cluster$withinss
## [1] 119.44345 84.80614 219.37064
bollywood_song.cluster$betweenss
## [1] 612.3798
bollywood_song.cluster$totss
## [1] 1036
bollywood_song.cluster$centers
## Box_Office_Collection Profit Earning_Ratio Budget Youtube_Views
## 1 -0.3621454 -0.3166468 -0.3064538 -0.3253091 -0.3789042
## 2 3.9299657 3.6381023 1.2716440 2.9590305 3.4826619
## 3 0.5664005 0.4663997 0.7727211 0.5903869 0.6824929
## Youtube_Likes Youtube_Dislikes
## 1 -0.3483416 -0.3671633
## 2 4.3475571 4.1193989
## 3 0.4637555 0.5549658
Now we will try Elbow Metho
The elbow method looks at the percentage of variance explained as a function of the number of clusters: One should choose a number of clusters so that adding another cluster doesn’t give much better modeling of the data. More precisely, if one plots the percentage of variance explained by the clusters against the number of clusters, the first clusters will add much information (explain a lot of variance), but at some point the marginal gain will drop, giving an angle in the graph. The number of clusters is chosen at this point, hence the “elbow criterion”. This “elbow” cannot always be unambiguously identified. Steps for Elbow methos are: Define maximum number of cluster Run in into a loop for each cluster and Calculate the variance within the group..(This is because aim of the clustering is to ) Plot the no of cluster and wss Clustter Elbow diagram shows that optimal number of cluster can be taken as 6 and then we can proeed.
maxcluster <- 10
initial <-2
set.seed(145)
mywss <- (nrow(bollywood_song.RCDF)-1)*sum(apply(bollywood_song.RCDF,2,var)) ## Here we are calculating degrees of freedom
for (ilimit in initial:maxcluster){
myoss <- kmeans(bollywood_song.RCDF, centers = ilimit, nstart = 2, iter.max = 10)
mywss[ilimit] <- sum(myoss$withinss)
}
plot(1:maxcluster, mywss,xlab="Number of Clusters", ylab="Within groups sum of squares")
#clusplot(mystand,bollywood_song.cluster)
Now we will try with 6 clustes..Key obsernation from earlier cluster analysis (with 3) and current cluster (with cluster 6) are BetweenSS has been increased from 612.9031 to 764.6206. This means clyusters have become more clear WithinSS minimum value has come down from 84.80614 to 20.37320; maximum value has come down from 244 to 84 TotalSS remained same and it is because our dataset remained same and only no of cluster have been changed
song.revised.clusterr <- kmeans(bollywood_song.RCDF, centers = 6, nstart = 2, iter.max = 10)
song.revised.clusterr$size
## [1] 55 42 24 5 15 8
names(song.revised.clusterr)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
song.revised.clusterr$withinss
## [1] 20.37320 37.54567 29.64963 84.80614 59.87710 39.12767
song.revised.clusterr$betweenss
## [1] 764.6206
song.revised.clusterr$totss
## [1] 1036
song.revised.clusterr$centers
## Box_Office_Collection Profit Earning_Ratio Budget Youtube_Views
## 1 -0.49334993 -0.36711999 -0.524730196 -0.6241513 -0.64790254
## 2 -0.20165693 -0.34533849 -0.432114705 0.2929152 -0.06034495
## 3 -0.09139799 0.05514269 0.766478738 -0.4583526 0.06578531
## 4 3.92996575 3.63810229 1.271644015 2.9590305 3.48266195
## 5 0.39744600 0.06799541 0.008390506 1.1411479 1.25152401
## 6 1.52323379 1.77024361 2.766176373 0.1392467 0.05051378
## Youtube_Likes Youtube_Dislikes
## 1 -0.51076562 -0.5095289
## 2 -0.15822881 -0.1477051
## 3 -0.02636968 -0.1282514
## 4 4.34755710 4.1193989
## 5 0.82261983 1.1313520
## 6 0.16168854 -0.0326923
Now let us understand the different cluster and their distribution through visualisation..Note here that first we are converting the cluster into factor and then storing the cluster id into our data frame. This means now all the data record has their corresponding cluster populated
Point diagram shows all the 6 cluster
# Now lets do the ggplot...
bollywood_song$Cluster <- as.factor(song.revised.clusterr$cluster)
ggplot(data = bollywood_song, aes(bollywood_song$Youtube_Views,bollywood_song$Youtube_Likes,colour = bollywood_song$Cluster)) +
geom_point() +
scale_x_discrete( name = "Youtube Views") +
scale_y_continuous( name = "Youtibe Likes") +
ggtitle("Youtube by Cluster")