This report analyses the demographic prediction results of Boosted Decision Tree Model trained on the Neilson Rating data.
## from the knite WD: C:\Users\Admin\Documents\R\Demo_Data_Prep\R
predictions <- read.csv("BDT_Predictions.csv")
library(ggplot2)
Here is the summary of the result set: Print top 20 rows of the result set:
## Print top 20 rows
head(predictions,30)
## Title Gender Age.Bracket Scored.Labels
## 1 24 HOURS IN THE A&E Female [12-14] 5.160398
## 2 24 HOURS IN THE A&E Female [15-17] 4.638297
## 3 24 HOURS IN THE A&E Female [18-20] 5.358187
## 4 24 HOURS IN THE A&E Female [2-5] 3.049584
## 5 24 HOURS IN THE A&E Female [21-24] 5.935085
## 6 24 HOURS IN THE A&E Female [25-29] 6.077176
## 7 24 HOURS IN THE A&E Female [30-34] 7.110880
## 8 24 HOURS IN THE A&E Female [35-39] 8.362382
## 9 24 HOURS IN THE A&E Female [40-44] 7.830174
## 10 24 HOURS IN THE A&E Female [45-49] 8.342432
## 11 24 HOURS IN THE A&E Female [50-54] 8.675931
## 12 24 HOURS IN THE A&E Female [55-64] 9.201477
## 13 24 HOURS IN THE A&E Female [6-8] 2.941205
## 14 24 HOURS IN THE A&E Female [65+] 8.221851
## 15 24 HOURS IN THE A&E Female [9-11] 3.996035
## 16 24 HOURS IN THE A&E Male [12-14] 2.639175
## 17 24 HOURS IN THE A&E Male [15-17] 1.852015
## 18 24 HOURS IN THE A&E Male [18-20] 2.537749
## 19 24 HOURS IN THE A&E Male [2-5] 1.782327
## 20 24 HOURS IN THE A&E Male [21-24] 2.420793
## 21 24 HOURS IN THE A&E Male [25-29] 2.583813
## 22 24 HOURS IN THE A&E Male [30-34] 3.602087
## 23 24 HOURS IN THE A&E Male [35-39] 5.032439
## 24 24 HOURS IN THE A&E Male [40-44] 4.430747
## 25 24 HOURS IN THE A&E Male [45-49] 4.402406
## 26 24 HOURS IN THE A&E Male [50-54] 4.778188
## 27 24 HOURS IN THE A&E Male [55-64] 5.399017
## 28 24 HOURS IN THE A&E Male [6-8] 1.788482
## 29 24 HOURS IN THE A&E Male [65+] 4.676173
## 30 24 HOURS IN THE A&E Male [9-11] 2.345391
A summary of the prediction set:
## summarize the data
summary(predictions)
## Title Gender Age.Bracket
## 10 THINGS I HATE ABOUT YOU : 30 Female:34905 [12-14]: 4654
## 11.22.63 : 30 Male :34905 [15-17]: 4654
## 12 MONKEYS : 30 [18-20]: 4654
## 14 DIARIES OF THE GREAT WAR: 30 [2-5] : 4654
## 16 AND PREGNANT : 30 [21-24]: 4654
## 1600 PENN : 30 [25-29]: 4654
## (Other) :69630 (Other):41886
## Scored.Labels
## Min. :-0.2748
## 1st Qu.: 3.1113
## Median : 4.3863
## Mean : 4.8097
## 3rd Qu.: 6.0492
## Max. :20.7719
##
Count of Unique Titles in the Test Set:
## count of unique titles in the result set
length(unique(predictions$Title))
## [1] 2327
Histograpm of Predicted Scores:
hist(predictions$Scored.Labels, col=2)
The density of the Predicted Scores:
## Check density of scores
d <- density(predictions$Scored.Labels)
plot(d)
Distributional behaviour of predicted scores:
## Check density of scores
boxplot(predictions$Scored.Labels)
Mean Score of each demo bracket:
## get the mean demo scores for the result set
demo <- aggregate(predictions$Scored.Labels, by = predictions[c('Gender','Age.Bracket')], mean)
## Sort the data by Gender Values
demo[order(demo$Gender),]
## Gender Age.Bracket x
## 1 Female [12-14] 3.796990
## 3 Female [15-17] 4.182331
## 5 Female [18-20] 3.991566
## 7 Female [2-5] 2.611849
## 9 Female [21-24] 4.623426
## 11 Female [25-29] 5.140889
## 13 Female [30-34] 5.745954
## 15 Female [35-39] 6.152591
## 17 Female [40-44] 6.302697
## 19 Female [45-49] 7.228679
## 21 Female [50-54] 7.840669
## 23 Female [55-64] 8.531056
## 25 Female [6-8] 2.847317
## 27 Female [65+] 7.824185
## 29 Female [9-11] 3.071794
## 2 Male [12-14] 3.445694
## 4 Male [15-17] 3.523230
## 6 Male [18-20] 3.343731
## 8 Male [2-5] 2.540694
## 10 Male [21-24] 3.371728
## 12 Male [25-29] 3.950273
## 14 Male [30-34] 4.408101
## 16 Male [35-39] 4.864605
## 18 Male [40-44] 5.066209
## 20 Male [45-49] 5.391658
## 22 Male [50-54] 6.152144
## 24 Male [55-64] 6.804319
## 26 Male [6-8] 2.509104
## 28 Male [65+] 6.304247
## 30 Male [9-11] 2.724384
Pie Chart Age brackets by the sum of Scores:
### Create a pie chart of the demo ages by percentage based on prediction scores
# create an aggregate view of age brackets by sum of scores
aggregation.age <- aggregate(predictions$Scored.Labels, by = predictions['Age.Bracket'], sum)
# generate percentages and draw the pie chart
slices <- as.integer(aggregation.age$x)
lbls <- aggregation.age$Age.Bracket
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),main="Age Brackets By Sum of Scores")
Bar chart indicating the ranked Age brackets by the sum of scores:
### aggregate the age brackets by the sum of scores
aggregation.age <- aggregation.age[order(-aggregation.age$x),]
# draw Bar chart
bp<- ggplot(aggregation.age, aes(x="", y=x, fill =Age.Bracket))+geom_bar(width = 1, stat = "identity")
bp
Pie Chart indicating Gender by the sum of Scores:
### aggregate the gender by the sum of scores
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions['Gender'], sum)
### Create a pie chart of the demo ages by percentage based on prediction scores
slices <- as.integer(aggregation.gender$x)
lbls <- aggregation.gender$Gender
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
# draw pie chart
pie(slices,labels = lbls, col = c("violetred1", "blue"),main="Pie Chart of Age Brackets By Sum of Scores")
Bar chart indicating the gender by the sum of scores:
# aggregate gender data by sum of scores
aggregation.gender <- aggregation.gender[order(-aggregation.gender$x),]
# rename new column
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "Score_Sum"
# draw Bar chart
bp<- ggplot(aggregation.gender, aes(x=Gender, y=Score_Sum, fill =Gender))+geom_bar(width = 1, stat = "identity")
bp
Heat Map of the predicted scores:
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer)
library(reshape2)
### dcast the data to convert into matrix of Title, Demo
demo.data <- dcast(predictions, Title~Gender+Age.Bracket)
## Using Scored.Labels as value column: use value.var to override.
# set titles as row names and get rid of this column
row.names(demo.data) <- demo.data$Title
# get the numberic data only
demo.data <- demo.data[,2:30]
# convert to a numeric matrix
demo_matrix <- data.matrix(demo.data)
# create a data subset for visualization
demo.sub <- head(demo_matrix, 50)
# creates a own color palette from red to green
my_palette <- colorRampPalette(c("red", "green", "yellow"))
heatmap.2(demo.sub, scale = "none", col=my_palette, trace="none", dendrogram=c("none"), symm=F,symkey=F,symbreaks=T, cexCol=0.5, cexRow=0.5,density.info="histogram",breaks = seq(-1, 20), key= T, key.xlab="Predeiction_Score")
Titles By Gender Split:
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions[c('Title','Gender')], sum)
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "gender.score"
# order the data by title, gender and score
aggregation.gender <- aggregation.gender[order(aggregation.gender$Title, aggregation.gender$Gender,aggregation.gender$gender.score),]
#plot the stacked bar chart
ggplot(data = head(aggregation.gender,100), aes(x = Title, y =gender.score, fill = Gender)) + geom_bar(stat="identity") + coord_flip()
Count of Female Titles:
### dcast the data to convert into matrix of Title, Male and Female
gender.dcast <- dcast(aggregation.gender, Title~Gender)
## Using gender.score as value column: use value.var to override.
# count of Female shows
length(unique(gender.dcast$Title[gender.dcast$Male < gender.dcast$Female]))
## [1] 2048
Count of Male Titles:
### dcast the data to convert into matrix of Title, Male and Female
gender.dcast <- dcast(aggregation.gender, Title~Gender)
## Using gender.score as value column: use value.var to override.
# count of male shows
length(unique(gender.dcast$Title[gender.dcast$Male > gender.dcast$Female]))
## [1] 279
Count of Negative predictions:
## from the knite WD: C:\Users\Admin\Documents\R\Demo_Data_Prep\R
predictions <- read.csv("BDT_Predictions.csv")
## Print count of negative and positive predictions on all demo
table(sign(predictions$Scored.Labels))
##
## -1 1
## 3 69807