Panel_US Prediction Analysis (scale:Linear [1-100])

This report analyses the demographic prediction results of the Boosted Decision Tree model trained on the Panel_US data, after the ratings were converted to a linear scale [1-100].

## read from the knitr working directory: C:\Users\Admin\Documents\R\Demo_Data_Prep\
predictions <- read.csv("BDT_US_ScaleConverted.csv")
colnames(predictions)
## [1] "Title"         "Gender"        "Age.Bracket"   "Scored.Labels"
## Force the predicted score to numeric. The conversion goes through
## as.character() because as.numeric() applied directly to a factor returns
## the internal level codes rather than the values the labels spell out.
predictions$Scored.Labels <- as.numeric(as.character(predictions$Scored.Labels))
class(predictions$Scored.Labels)
## [1] "numeric"
library(ggplot2)

Top 12 rows of the result set:

## Show the first 12 rows of the prediction set
head(predictions, n = 12)
##       Title Gender Age.Bracket Scored.Labels
## 1  Suspects   Male     [35-44]      67.56518
## 2  Suspects   Male     [55-64]      60.12343
## 3  Suspects Female     [55-64]      67.56518
## 4  Suspects Female     [35-44]      67.56518
## 5  Suspects   Male     [18-24]      70.52461
## 6  Suspects Female     [18-24]      67.56518
## 7  Suspects   Male     [25-34]      67.56518
## 8  Suspects Female     [25-34]      67.56518
## 9  Suspects   Male       [65+]      65.26256
## 10 Suspects Female       [65+]      69.05964
## 11 Suspects   Male     [45-44]      63.96233
## 12 Suspects Female     [45-44]      73.49471

A summary of the prediction set:

## Summarise every column of the prediction set
summary(predictions)
##                          Title          Gender      Age.Bracket  
##  10 Things I Hate About You :   12   Female:9144   [18-24]:3048  
##  100 Deeds For Eddie Mcdowd :   12   Male  :9144   [25-34]:3048  
##  14 Diaries Of The Great War:   12                 [35-44]:3048  
##  1600 Penn                  :   12                 [45-44]:3048  
##  17 Kids And Counting       :   12                 [55-64]:3048  
##  2 Broke Girls              :   12                 [65+]  :3048  
##  (Other)                    :18216                               
##  Scored.Labels  
##  Min.   : 1.01  
##  1st Qu.:54.95  
##  Median :59.92  
##  Mean   :58.79  
##  3rd Qu.:63.90  
##  Max.   :93.06  
## 

Count of Unique Titles in the Test Set:

## Number of distinct titles present in the result set
length(unique(predictions[["Title"]]))
## [1] 1524

Histogram of Predicted Scores:

## Histogram of the predicted scores; col=4 fills the bars with colour 4 of the active palette
hist(predictions$Scored.Labels, col=4)

plot of chunk unnamed-chunk-5

The density of the Predicted Scores:

## Estimate the density of the predicted scores and plot it.
## `col` is a graphical parameter: density() discards it with a warning,
## so it is passed to plot() where it actually colours the curve.
d <- density(predictions$Scored.Labels)
plot(d, col = 4)

plot of chunk unnamed-chunk-6

Distributional behaviour of predicted scores:

## Box plot of the predicted scores
boxplot(x = predictions$Scored.Labels, col = 4)

plot of chunk unnamed-chunk-7

Mean Score of each demo bracket:

## Average predicted score for every Gender x Age.Bracket combination
demo <- aggregate(
  predictions$Scored.Labels,
  by = predictions[c("Gender", "Age.Bracket")],
  FUN = mean
)

## Show the averages with the rows arranged by gender
demo[order(demo$Gender), ]
##    Gender Age.Bracket        x
## 1  Female     [18-24] 59.03567
## 3  Female     [25-34] 59.03567
## 5  Female     [35-44] 59.03567
## 7  Female     [45-44] 64.96521
## 9  Female     [55-64] 59.03567
## 11 Female       [65+] 60.53011
## 2    Male     [18-24] 61.99509
## 4    Male     [25-34] 59.03567
## 6    Male     [35-44] 59.03567
## 8    Male     [45-44] 55.43282
## 10   Male     [55-64] 51.59392
## 12   Male       [65+] 56.73304

Pie chart of age brackets by the sum of scores:

### Pie chart: share of the summed prediction scores held by each age bracket

# total predicted score per age bracket
aggregation.age <- aggregate(
  predictions$Scored.Labels,
  by = predictions["Age.Bracket"],
  FUN = sum
)

# integer totals, their rounded percentage shares, and "<bracket> <pct>%" labels
slices <- as.integer(aggregation.age$x)
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(aggregation.age$Age.Bracket, " ", pct, "%")

pie(
  slices,
  labels = lbls,
  col = rainbow(length(lbls)),
  main = "Age Brackets By Sum of Scores"
)

plot of chunk unnamed-chunk-9

Bar chart indicating the ranked Age brackets by the sum of scores:

### Rank the age brackets by their summed score (highest first)
aggregation.age <- aggregation.age[order(-aggregation.age$x), ]

# ggplot2 orders a discrete fill by factor level, not by row order, so the
# sort above only reaches the chart once the levels follow the ranking
aggregation.age$Age.Bracket <- factor(
  aggregation.age$Age.Bracket,
  levels = unique(as.character(aggregation.age$Age.Bracket))
)

# draw a single stacked bar with one segment per age bracket
bp <- ggplot(aggregation.age, aes(x = "", y = x, fill = Age.Bracket)) +
  geom_bar(width = 1, stat = "identity")
bp

plot of chunk unnamed-chunk-10

Pie Chart indicating Gender by the sum of Scores:

### total predicted score per gender
aggregation.gender <- aggregate(predictions$Scored.Labels, by = predictions["Gender"], FUN = sum)

### Pie chart of the genders by percentage of the summed prediction scores
slices <- as.integer(aggregation.gender$x)
pct <- round(slices / sum(slices) * 100)
# labels of the form "<gender> <pct>%"
lbls <- paste0(aggregation.gender$Gender, " ", pct, "%")

# draw pie chart; the title names Gender because the slices are genders, not age brackets
pie(slices, labels = lbls, col = c("violetred1", "blue"), main = "Pie Chart of Gender By Sum of Scores")

plot of chunk unnamed-chunk-11

Bar chart indicating the gender by the sum of scores:

# arrange the per-gender totals from highest to lowest
aggregation.gender <- aggregation.gender[order(-aggregation.gender$x), ]

# give the aggregated column a descriptive name
names(aggregation.gender)[names(aggregation.gender) == "x"] <- "Score_Sum"

# one bar per gender, bar height = summed score
bp <- ggplot(
  aggregation.gender,
  aes(x = Gender, y = Score_Sum, fill = Gender)
) +
  geom_bar(width = 1, stat = "identity")
bp

plot of chunk unnamed-chunk-12

Heat map of demo groups by prediction score (first 30 titles):

library(reshape2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# set the sorting order of the demo groups
# NOTE(review): "[45-44]" looks like a typo for "[45-54]", but the level has to
# match the label stored in the data -- confirm against the data source
predictions$Gender <- factor(predictions$Gender, levels = c("Female", "Male"))
predictions$Age.Bracket <- factor(predictions$Age.Bracket, levels = c("[18-24]","[25-34]","[35-44]","[45-44]","[55-64]","[65+]"))

# sort the data by gender and age within each title
predictions <- predictions[order(predictions$Title, predictions$Gender, predictions$Age.Bracket), ]

### cast to wide form: one row per Title, one column per Gender/Age.Bracket pair
# value.var is named explicitly so dcast() does not have to guess the value column
demo.data <- dcast(predictions, Title ~ Gender + Age.Bracket, value.var = "Scored.Labels")
# set titles as row names so the Title column can be dropped below
row.names(demo.data) <- demo.data$Title
dim(demo.data)
## [1] 1524   13
# keep the numeric demo columns only; Title is dropped by name so the
# selection does not depend on the number of demo columns
demo.data <- demo.data[, setdiff(names(demo.data), "Title"), drop = FALSE]

# convert to a numeric matrix
demo_matrix <- data.matrix(demo.data)

# create a data subset for visualization (first 30 titles)
demo.sub <- head(demo_matrix, 30)

# colour ramp running green -> red -> yellow
my_palette <- colorRampPalette(c("green", "red", "yellow"))

# Rowv/Colv = FALSE keep rows and columns in the order established above;
# by default heatmap.2 re-clusters them even when no dendrogram is drawn
heatmap.2(demo.sub, scale = "none", col = my_palette, trace = "none",
          dendrogram = "none", Rowv = FALSE, Colv = FALSE,
          symm = FALSE, symkey = FALSE, symbreaks = TRUE,
          cexCol = 0.7, cexRow = 0.6, density.info = "histogram",
          breaks = seq(1, 100), key = TRUE, key.xlab = "Prediction_Score")

plot of chunk unnamed-chunk-13

Titles By Gender Split:

# summed predicted score for every title, split by gender
aggregation.gender <- aggregate(
  predictions$Scored.Labels,
  by = predictions[c("Title", "Gender")],
  FUN = sum
)

names(aggregation.gender)[names(aggregation.gender) == "x"] <- "gender.score"

# arrange the rows by title, then gender, then score
aggregation.gender <- aggregation.gender[with(aggregation.gender, order(Title, Gender, gender.score)), ]

# stacked horizontal bar chart built from the first 100 rows
ggplot(
  data = head(aggregation.gender, 100),
  aes(x = Title, y = gender.score, fill = Gender)
) +
  geom_bar(stat = "identity") +
  coord_flip()

plot of chunk unnamed-chunk-14

Count of Female Titles:

### cast to one row per Title with a Female and a Male score column;
### value.var is named explicitly so dcast() does not have to guess it
gender.dcast <- dcast(aggregation.gender, Title ~ Gender, value.var = "gender.score")
# count of titles whose Female score total exceeds the Male score total
length(unique(gender.dcast$Title[gender.dcast$Male < gender.dcast$Female]))
## [1] 1524

Count of Male Titles:

# count of titles whose Male score total exceeds the Female score total
length(unique(with(gender.dcast, Title[Male > Female])))
## [1] 0

Count of negative and positive predictions:

## Tally the predicted scores by sign (-1 negative, 0 zero, 1 positive) across all demo groups
table(sign(predictions[["Scored.Labels"]]))
## 
##     1 
## 18288