This report analyses the demographic prediction results of Boosted Decision Tree Model trained on the Neilson Rating data. The Train set consists of 103 shows with Neilson ratings on 30 unique demographics. Here is the summary od the result set:
predictions <- read.csv("BDT_Predictions.csv")
Print top 20 rows of the result set:
## Print top 20 rows
head(predictions,20)
## Title Gender Age.Bracket Scored.Labels
## 1 24 HOURS IN THE A&E Female [12-14] 5.160398
## 2 24 HOURS IN THE A&E Female [15-17] 4.638297
## 3 24 HOURS IN THE A&E Female [18-20] 5.358187
## 4 24 HOURS IN THE A&E Female [2-5] 3.049584
## 5 24 HOURS IN THE A&E Female [21-24] 5.935085
## 6 24 HOURS IN THE A&E Female [25-29] 6.077176
## 7 24 HOURS IN THE A&E Female [30-34] 7.110880
## 8 24 HOURS IN THE A&E Female [35-39] 8.362382
## 9 24 HOURS IN THE A&E Female [40-44] 7.830174
## 10 24 HOURS IN THE A&E Female [45-49] 8.342432
## 11 24 HOURS IN THE A&E Female [50-54] 8.675931
## 12 24 HOURS IN THE A&E Female [55-64] 9.201477
## 13 24 HOURS IN THE A&E Female [6-8] 2.941205
## 14 24 HOURS IN THE A&E Female [65+] 8.221851
## 15 24 HOURS IN THE A&E Female [9-11] 3.996035
## 16 24 HOURS IN THE A&E Male [12-14] 2.639175
## 17 24 HOURS IN THE A&E Male [15-17] 1.852015
## 18 24 HOURS IN THE A&E Male [18-20] 2.537749
## 19 24 HOURS IN THE A&E Male [2-5] 1.782327
## 20 24 HOURS IN THE A&E Male [21-24] 2.420793
A summary of the prediction set:
## summarize the data
summary(predictions)
## Title Gender Age.Bracket
## 10 THINGS I HATE ABOUT YOU : 30 Female:34905 [12-14]: 4654
## 11.22.63 : 30 Male :34905 [15-17]: 4654
## 12 MONKEYS : 30 [18-20]: 4654
## 14 DIARIES OF THE GREAT WAR: 30 [2-5] : 4654
## 16 AND PREGNANT : 30 [21-24]: 4654
## 1600 PENN : 30 [25-29]: 4654
## (Other) :69630 (Other):41886
## Scored.Labels
## Min. :-0.2748
## 1st Qu.: 3.1113
## Median : 4.3863
## Mean : 4.8097
## 3rd Qu.: 6.0492
## Max. :20.7719
##
Count of Unique Titles in the Test Set:
## count of unique titles in the result set
length(unique(predictions$Title))
## [1] 2327
Histograpm of Predicted Scores:
hist(predictions$Scored.Labels, col=2)
The density of the Predicted Scores:
## Check density of scores
d <- density(predictions$Scored.Labels)
plot(d, col=2)
Distributional behaviour of predicted scores:
## Check density of scores
boxplot(predictions$Scored.Labels, col=2)
Mean Score of each demo bracket:
## get the mean demo scores for the result set
demo <- aggregate(predictions$Scored.Labels, by = predictions[c('Gender','Age.Bracket')], mean)
## Sort the data by a gender
demo[order(demo$Gender),]
## Gender Age.Bracket x
## 1 Female [12-14] 3.796990
## 3 Female [15-17] 4.182331
## 5 Female [18-20] 3.991566
## 7 Female [2-5] 2.611849
## 9 Female [21-24] 4.623426
## 11 Female [25-29] 5.140889
## 13 Female [30-34] 5.745954
## 15 Female [35-39] 6.152591
## 17 Female [40-44] 6.302697
## 19 Female [45-49] 7.228679
## 21 Female [50-54] 7.840669
## 23 Female [55-64] 8.531056
## 25 Female [6-8] 2.847317
## 27 Female [65+] 7.824185
## 29 Female [9-11] 3.071794
## 2 Male [12-14] 3.445694
## 4 Male [15-17] 3.523230
## 6 Male [18-20] 3.343731
## 8 Male [2-5] 2.540694
## 10 Male [21-24] 3.371728
## 12 Male [25-29] 3.950273
## 14 Male [30-34] 4.408101
## 16 Male [35-39] 4.864605
## 18 Male [40-44] 5.066209
## 20 Male [45-49] 5.391658
## 22 Male [50-54] 6.152144
## 24 Male [55-64] 6.804319
## 26 Male [6-8] 2.509104
## 28 Male [65+] 6.304247
## 30 Male [9-11] 2.724384
Pie Chart Age brackets by the sum of Scores:
### Create a pie chart of the demo ages by percentage based on prediction scores
# create an aggregate view of age brackets by sum of scores
aggregation.age <- aggregate(predictions$Scored.Labels, by = predictions['Age.Bracket'], sum)
# generate percentages and draw the pie chart
slices <- as.integer(aggregation.age$x)
lbls <- aggregation.age$Age.Bracket
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),main="Age Brackets By Sum of Scores")
Bar chart indicating the ranked Age brackets by the sum of scores:
### aggregate the age brackets by the sum of scores
aggregation.age <- aggregation.age[order(-aggregation.age$x),]
# draw Bar chart
bp<- ggplot(aggregation.age, aes(x="", y=x, fill =Age.Bracket))+geom_bar(width = 1, stat = "identity")
## Error in eval(expr, envir, enclos): could not find function "ggplot"
bp
## Error in eval(expr, envir, enclos): object 'bp' not found
Pie Chart indicating Gender by the sum of Scores:
### aggregate the gender by the sum of scores
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions['Gender'], sum)
### Create a pie chart of the demo ages by percentage based on prediction scores
slices <- as.integer(aggregation.gender$x)
lbls <- aggregation.gender$Gender
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
# draw pie chart
pie(slices,labels = lbls, col = c("violetred1", "blue"),main="Pie Chart of Age Brackets By Sum of Scores")
Bar chart indicating the gender by the sum of scores:
library(ggplot2)
# aggregate gender data by sum of scores
aggregation.gender <- aggregation.gender[order(-aggregation.gender$x),]
# rename new column
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "Score_Sum"
# draw Bar chart
bp<- ggplot(aggregation.gender, aes(x=Gender, y=Score_Sum, fill =Gender))+geom_bar(width = 1, stat = "identity")
bp
Heat map of Demo by predictions scores on all titles:
library(reshape2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# sett the sorting order
predictions$Gender <- factor(predictions$Gender, levels = c("Female", "Male"))
predictions$Age.Bracket <- factor(predictions$Age.Bracket, levels = c("[2-5]","[6-8]","[9-11]","[12-14]","[15-17]","[18-20]","[21-24]","[25-29]","[30-34]","[35-39]","[40-44]","[45-49]","[50-54]","[55-64]","[65+]"))
#sort the data based on gender and age for each title
predictions <- predictions[order(predictions$Title, predictions$Gender,predictions$Age.Bracket),]
### dcast the data to convert into matrix of Title, Demo
demo.data <- dcast(predictions, Title~Gender+Age.Bracket)
## Using Scored.Labels as value column: use value.var to override.
# set titles as row names and get rid of this column
row.names(demo.data) <- demo.data$Title
dim(demo.data)
## [1] 2327 31
# get the numberic data only
demo.data <- demo.data[,2:30]
# convert to a numeric matrix
demo_matrix <- data.matrix(demo.data)
# create a data subset for visualization
demo.sub <- head(demo_matrix, 30)
# creates a own color palette from red to green
my_palette <- colorRampPalette(c("red", "green", "yellow"))
heatmap.2(demo.sub, scale = "none", col=my_palette, trace="none", dendrogram=c("none"), symm=F,symkey=F,symbreaks=T, cexCol=0.7, cexRow=0.6,density.info="histogram",breaks = seq(-1, 20), key= T, key.xlab="Predeiction_Score")
Titles By Gender Split:
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions[c('Title','Gender')], sum)
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "gender.score"
# order the data by title, gender and score
aggregation.gender <- aggregation.gender[order(aggregation.gender$Title, aggregation.gender$Gender,aggregation.gender$gender.score),]
#plot the stacked bar chart
ggplot(data = head(aggregation.gender,100), aes(x = Title, y =gender.score, fill = Gender)) + geom_bar(stat="identity") + coord_flip()
Count of Female Titles:
### dcast the data to convert into matrix of Title, Male and Female
gender.dcast <- dcast(aggregation.gender, Title~Gender)
## Using gender.score as value column: use value.var to override.
# count of Female shows
length(unique(gender.dcast$Title[gender.dcast$Male < gender.dcast$Female]))
## [1] 2048
Count of Male Titles:
# count of male shows
length(unique(gender.dcast$Title[gender.dcast$Male > gender.dcast$Female]))
## [1] 279
Count of Negative predictions:
## Print count of negative and positive predictions on all demo
table(sign(predictions$Scored.Labels))
##
## -1 1
## 3 69807