This report analyses the demographic prediction results of Boosted Decision Tree Model trained on the Panel_US data after conversion of ratings on quadratic scale [1-100].
library(ggplot2)
library(reshape)
### load data
predictions <- read.csv("BDT_US_Quadratic.csv")
predictions$RescaledScore <- as.numeric(predictions$RescaledScore)
### split the demo column into two based on spaces
predictions = transform(predictions, new = colsplit(predictions$Demo, split = " ", names = c('Gender', 'Age')))
predictions$title_id <- NULL
predictions$Demo <- NULL
predictions <- setNames(predictions, c("title","score","gender", "age"))
predictions <- predictions[c('title','gender','age','score')]
names(predictions)
## [1] "title" "gender" "age" "score"
Top 20 rows of the result set:
## Print top 20 rows
head(predictions,12)
## title gender age score
## 1 Suspects Male [65+] 63.81440
## 2 Suspects Male [45-44] 62.25505
## 3 Suspects Male [18-24] 70.24672
## 4 Suspects Female [55-64] 66.60513
## 5 Suspects Male [25-34] 66.60513
## 6 Suspects Male [35-44] 66.60513
## 7 Suspects Female [25-34] 66.60513
## 8 Suspects Male [55-64] 57.72056
## 9 Suspects Female [45-44] 73.96347
## 10 Suspects Female [18-24] 66.60513
## 11 Suspects Female [65+] 68.43636
## 12 Suspects Female [35-44] 66.60513
A summary of the prediction set:
## summarize the data
summary(predictions)
## title gender age
## 10 Things I Hate About You : 12 Female:9144 [18-24]:3048
## 100 Deeds For Eddie Mcdowd : 12 Male :9144 [25-34]:3048
## 14 Diaries Of The Great War: 12 [35-44]:3048
## 1600 Penn : 12 [45-44]:3048
## 17 Kids And Counting : 12 [55-64]:3048
## 2 Broke Girls : 12 [65+] :3048
## (Other) :18216
## score
## Min. : 1.00
## 1st Qu.: 51.77
## Median : 57.49
## Mean : 56.42
## 3rd Qu.: 62.18
## Max. :100.00
##
Count of Unique Titles in the Test Set:
## count of unique titles in the result set
length(unique(predictions$title))
## [1] 1524
Histograpm of Predicted Scores:
hist(predictions$score, col=4)
The density of the Predicted Scores:
## Check density of scores
d <- density(predictions$score)
plot(d, col=4)
Distributional behaviour of predicted scores:
## Check density of scores
boxplot(predictions$score, col=4)
Mean Score of each demo bracket:
## get the mean demo scores for the result set
demo <- aggregate(predictions$score, by = predictions[c('gender','age')], mean)
## Sort the data by Gender Values
demo[order(demo$gender),]
## gender age x
## 1 Female [18-24] 56.67109
## 3 Female [25-34] 56.67109
## 5 Female [35-44] 56.67109
## 7 Female [45-44] 63.67327
## 9 Female [55-64] 56.67109
## 11 Female [65+] 58.41254
## 2 Male [18-24] 60.13491
## 4 Male [25-34] 56.67109
## 6 Male [35-44] 56.67109
## 8 Male [45-44] 52.53745
## 10 Male [55-64] 48.23356
## 12 Male [65+] 54.01868
Pie Chart Age brackets by the sum of Scores:
### Create a pie chart of the demo ages by percentage based on prediction scores
# create an aggregate view of age brackets by sum of scores
aggregation.age <- aggregate(predictions$score, by = predictions[c('gender','age')], sum)
# generate percentages and draw the pie chart
slices <- as.integer(aggregation.age$x)
lbls <- aggregation.age$age
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),main="Age Brackets By Sum of Scores")
Bar chart indicating the ranked Age brackets by the sum of scores:
### aggregate the age brackets by the sum of scores
aggregation.age <- aggregation.age[order(-aggregation.age$x),]
# draw Bar chart
bp<- ggplot(aggregation.age, aes(x="", y=x, fill =age))+geom_bar(width = 1, stat = "identity")
bp
Pie Chart indicating Gender by the sum of Scores:
### aggregate the gender by the sum of scores
aggregation.gender <- aggregate(predictions$score, by = predictions['gender'], sum)
### Create a pie chart of the demo ages by percentage based on prediction scores
slices <- as.integer(aggregation.gender$x)
lbls <- aggregation.gender$gender
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
# draw pie chart
pie(slices,labels = lbls, col = c("violetred1", "blue"),main="Pie Chart of Age Brackets By Sum of Scores")
Bar chart indicating the gender by the sum of scores:
# aggregate gender data by sum of scores
aggregation.gender <- aggregation.gender[order(-aggregation.gender$x),]
# rename new column
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "Score_Sum"
# draw Bar chart
bp<- ggplot(aggregation.gender, aes(x=gender, y=Score_Sum, fill =gender))+geom_bar(width = 1, stat = "identity")
bp
Heat map of Demo by predictions scores on all titles:
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# sett the sorting order
predictions$gender <- factor(predictions$gender, levels = c("Female", "Male"))
predictions$age <- factor(predictions$age, levels = c("[18-24]","[25-34]","[35-44]","[45-44]","[55-64]","[65+]"))
#sort the data based on gender and age for each title
predictions <- predictions[order(predictions$title, predictions$gender,predictions$age),]
### dcast the data to convert into matrix of Title, Demo
demo.data <- dcast(predictions, title~gender+age)
## Using score as value column: use value.var to override.
# set titles as row names and get rid of this column
row.names(demo.data) <- demo.data$title
dim(demo.data)
## [1] 1524 13
# get the numberic data only
demo.data <- demo.data[,2:13]
# convert to a numeric matrix
demo_matrix <- data.matrix(demo.data)
# create a data subset for visualization
demo.sub <- head(demo_matrix, 30)
# creates a own color palette from red to green
my_palette <- colorRampPalette(c("yellow", "green", "red"))
heatmap.2(demo.sub, scale = "none", col=my_palette, trace="none", dendrogram=c("none"), symm=F,symkey=F,symbreaks=T, cexCol=0.7, cexRow=0.6,density.info="histogram",breaks = seq(15,80), key= T, key.xlab="Predeiction_Score")
Titles By Gender Split:
aggregation.gender <- aggregate(predictions$score, by = predictions[c('title','gender')], sum)
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "gender.score"
# order the data by title, gender and score
aggregation.gender <- aggregation.gender[order(aggregation.gender$title, aggregation.gender$gender,aggregation.gender$gender.score),]
#plot the stacked bar chart
ggplot(data = head(aggregation.gender,100), aes(x = title, y =gender.score, fill = gender)) + geom_bar(stat="identity") + coord_flip()
Count of Female Titles:
### dcast the data to convert into matrix of Title, Male and Female
gender.dcast <- dcast(aggregation.gender, title~gender)
## Using gender.score as value column: use value.var to override.
# count of Female shows
length(unique(gender.dcast$title[gender.dcast$Male < gender.dcast$Female]))
## [1] 1524
Count of Male Titles:
# count of male shows
length(unique(gender.dcast$title[gender.dcast$Male > gender.dcast$Female]))
## [1] 0
Count of Negative predictions:
## Print count of negative and positive predictions on all demo
table(sign(predictions$score))
##
## 1
## 18288