NR Prediction Analysis

This report analyses the demographic prediction results of Boosted Decision Tree Model trained on the Neilson Rating data. The Train set consists of 103 shows with Neilson ratings on 30 unique demographics. Here is the summary od the result set:

predictions <- read.csv("BDT_Predictions.csv")

Print top 20 rows of the result set:

## Print top 20 rows
head(predictions,20)
##                  Title Gender Age.Bracket Scored.Labels
## 1  24 HOURS IN THE A&E Female     [12-14]      5.160398
## 2  24 HOURS IN THE A&E Female     [15-17]      4.638297
## 3  24 HOURS IN THE A&E Female     [18-20]      5.358187
## 4  24 HOURS IN THE A&E Female       [2-5]      3.049584
## 5  24 HOURS IN THE A&E Female     [21-24]      5.935085
## 6  24 HOURS IN THE A&E Female     [25-29]      6.077176
## 7  24 HOURS IN THE A&E Female     [30-34]      7.110880
## 8  24 HOURS IN THE A&E Female     [35-39]      8.362382
## 9  24 HOURS IN THE A&E Female     [40-44]      7.830174
## 10 24 HOURS IN THE A&E Female     [45-49]      8.342432
## 11 24 HOURS IN THE A&E Female     [50-54]      8.675931
## 12 24 HOURS IN THE A&E Female     [55-64]      9.201477
## 13 24 HOURS IN THE A&E Female       [6-8]      2.941205
## 14 24 HOURS IN THE A&E Female       [65+]      8.221851
## 15 24 HOURS IN THE A&E Female      [9-11]      3.996035
## 16 24 HOURS IN THE A&E   Male     [12-14]      2.639175
## 17 24 HOURS IN THE A&E   Male     [15-17]      1.852015
## 18 24 HOURS IN THE A&E   Male     [18-20]      2.537749
## 19 24 HOURS IN THE A&E   Male       [2-5]      1.782327
## 20 24 HOURS IN THE A&E   Male     [21-24]      2.420793

A summary of the prediction set:

## summarize the data
summary(predictions)
##                          Title          Gender       Age.Bracket   
##  10 THINGS I HATE ABOUT YOU :   30   Female:34905   [12-14]: 4654  
##  11.22.63                   :   30   Male  :34905   [15-17]: 4654  
##  12 MONKEYS                 :   30                  [18-20]: 4654  
##  14 DIARIES OF THE GREAT WAR:   30                  [2-5]  : 4654  
##  16 AND PREGNANT            :   30                  [21-24]: 4654  
##  1600 PENN                  :   30                  [25-29]: 4654  
##  (Other)                    :69630                  (Other):41886  
##  Scored.Labels    
##  Min.   :-0.2748  
##  1st Qu.: 3.1113  
##  Median : 4.3863  
##  Mean   : 4.8097  
##  3rd Qu.: 6.0492  
##  Max.   :20.7719  
## 

Count of Unique Titles in the Test Set:

## count of unique titles in the result set
length(unique(predictions$Title))
## [1] 2327

Histograpm of Predicted Scores:

hist(predictions$Scored.Labels, col=2)

plot of chunk unnamed-chunk-5

The density of the Predicted Scores:

## Check density of scores
d <- density(predictions$Scored.Labels)
plot(d, col=2)

plot of chunk unnamed-chunk-6

Distributional behaviour of predicted scores:

## Check density of scores
boxplot(predictions$Scored.Labels, col=2)

plot of chunk unnamed-chunk-7

Mean Score of each demo bracket:

## get the mean demo scores for the result set
demo <- aggregate(predictions$Scored.Labels, by = predictions[c('Gender','Age.Bracket')], mean)

## Sort the data by a gender
demo[order(demo$Gender),]
##    Gender Age.Bracket        x
## 1  Female     [12-14] 3.796990
## 3  Female     [15-17] 4.182331
## 5  Female     [18-20] 3.991566
## 7  Female       [2-5] 2.611849
## 9  Female     [21-24] 4.623426
## 11 Female     [25-29] 5.140889
## 13 Female     [30-34] 5.745954
## 15 Female     [35-39] 6.152591
## 17 Female     [40-44] 6.302697
## 19 Female     [45-49] 7.228679
## 21 Female     [50-54] 7.840669
## 23 Female     [55-64] 8.531056
## 25 Female       [6-8] 2.847317
## 27 Female       [65+] 7.824185
## 29 Female      [9-11] 3.071794
## 2    Male     [12-14] 3.445694
## 4    Male     [15-17] 3.523230
## 6    Male     [18-20] 3.343731
## 8    Male       [2-5] 2.540694
## 10   Male     [21-24] 3.371728
## 12   Male     [25-29] 3.950273
## 14   Male     [30-34] 4.408101
## 16   Male     [35-39] 4.864605
## 18   Male     [40-44] 5.066209
## 20   Male     [45-49] 5.391658
## 22   Male     [50-54] 6.152144
## 24   Male     [55-64] 6.804319
## 26   Male       [6-8] 2.509104
## 28   Male       [65+] 6.304247
## 30   Male      [9-11] 2.724384

Pie Chart Age brackets by the sum of Scores:

### Create a pie chart of the demo ages by percentage based on prediction scores

# create an aggregate view of age brackets by sum of scores
aggregation.age <- aggregate(predictions$Scored.Labels, by = predictions['Age.Bracket'], sum)

# generate percentages and draw the pie chart
slices <- as.integer(aggregation.age$x)
lbls <- aggregation.age$Age.Bracket
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels 
lbls <- paste(lbls,"%",sep="") # ad % to labels 

pie(slices,labels = lbls, col=rainbow(length(lbls)),main="Age Brackets By Sum of Scores")

plot of chunk unnamed-chunk-9

Bar chart indicating the ranked Age brackets by the sum of scores:

### aggregate the age brackets by the sum of scores
aggregation.age <- aggregation.age[order(-aggregation.age$x),]

# draw Bar chart
bp<- ggplot(aggregation.age, aes(x="", y=x, fill =Age.Bracket))+geom_bar(width = 1, stat = "identity")
## Error in eval(expr, envir, enclos): could not find function "ggplot"
bp
## Error in eval(expr, envir, enclos): object 'bp' not found

Pie Chart indicating Gender by the sum of Scores:

### aggregate the gender by the sum of scores
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions['Gender'], sum)

### Create a pie chart of the demo ages by percentage based on prediction scores
slices <- as.integer(aggregation.gender$x)
lbls <- aggregation.gender$Gender
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels 
lbls <- paste(lbls,"%",sep="") # ad % to labels 

# draw pie chart
pie(slices,labels = lbls, col = c("violetred1", "blue"),main="Pie Chart of Age Brackets By Sum of Scores")

plot of chunk unnamed-chunk-11

Bar chart indicating the gender by the sum of scores:

library(ggplot2)
# aggregate gender data by sum of scores
aggregation.gender <- aggregation.gender[order(-aggregation.gender$x),]

# rename new column
names(aggregation.gender)[names(aggregation.gender)=="x"] <- "Score_Sum"

# draw Bar chart
bp<- ggplot(aggregation.gender, aes(x=Gender, y=Score_Sum, fill =Gender))+geom_bar(width = 1, stat = "identity")
bp

plot of chunk unnamed-chunk-12

Heat map of Demo by predictions scores on all titles:

library(reshape2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# sett the sorting order
predictions$Gender <- factor(predictions$Gender, levels = c("Female", "Male"))
predictions$Age.Bracket <- factor(predictions$Age.Bracket, levels = c("[2-5]","[6-8]","[9-11]","[12-14]","[15-17]","[18-20]","[21-24]","[25-29]","[30-34]","[35-39]","[40-44]","[45-49]","[50-54]","[55-64]","[65+]"))

#sort the data based on gender and age for each title
predictions <- predictions[order(predictions$Title, predictions$Gender,predictions$Age.Bracket),]

### dcast the data to convert into matrix of Title, Demo

demo.data <- dcast(predictions, Title~Gender+Age.Bracket)
## Using Scored.Labels as value column: use value.var to override.
# set titles as row names and get rid of this column 
row.names(demo.data) <- demo.data$Title
dim(demo.data)
## [1] 2327   31
# get the numberic data only 
demo.data <- demo.data[,2:30]

# convert to a numeric matrix
demo_matrix <- data.matrix(demo.data)

# create a data subset for visualization
demo.sub <- head(demo_matrix, 30)

# creates a own color palette from red to green
my_palette <- colorRampPalette(c("red", "green", "yellow"))

heatmap.2(demo.sub, scale = "none",  col=my_palette, trace="none", dendrogram=c("none"), symm=F,symkey=F,symbreaks=T, cexCol=0.7, cexRow=0.6,density.info="histogram",breaks = seq(-1, 20), key= T, key.xlab="Predeiction_Score") 

plot of chunk unnamed-chunk-13

Titles By Gender Split:

aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions[c('Title','Gender')], sum)

names(aggregation.gender)[names(aggregation.gender)=="x"] <- "gender.score"

# order the data by title, gender and score
aggregation.gender <- aggregation.gender[order(aggregation.gender$Title, aggregation.gender$Gender,aggregation.gender$gender.score),]

#plot the stacked bar chart
ggplot(data = head(aggregation.gender,100), aes(x = Title, y =gender.score, fill = Gender)) + geom_bar(stat="identity") + coord_flip()

plot of chunk unnamed-chunk-14

Count of Female Titles:

### dcast the data to convert into matrix of Title, Male and Female
gender.dcast <- dcast(aggregation.gender, Title~Gender)
## Using gender.score as value column: use value.var to override.
# count of Female shows
length(unique(gender.dcast$Title[gender.dcast$Male < gender.dcast$Female]))
## [1] 2048

Count of Male Titles:

# count of male shows
length(unique(gender.dcast$Title[gender.dcast$Male > gender.dcast$Female]))
## [1] 279

Count of Negative predictions:

## Print count of negative and positive predictions on all demo
table(sign(predictions$Scored.Labels))
## 
##    -1     1 
##     3 69807