Comparison of Panel_US Train And Test Sets

This report compares the statistics of the train set against those of the test set for the US panel data.

DATA LOADING AND SUMMARY STATS:

#### ===================================
###  Train Vs. Test Data Distribution
### ====================================

# Load the labelled training data and the BDT result file used as the test set.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
# NOTE(review): the summaries further down show an age level "[45-44]" in both
# frames, presumably a typo for "[45-54]" in the source CSVs -- confirm upstream.
train <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\US\\class_label_US.csv", header = TRUE)
test <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\US\\Results\\BDT_US.csv", header = TRUE)
library(reshape)

### Break the space-separated Demo column of the train set into gender and age
train <- transform(
  train,
  new = colsplit(train$Demo, split = " ", names = c("Gender", "Age"))
)
train[["Demo"]] <- NULL
# Positional rename: assumes the columns left after dropping Demo are the title,
# the score, and then the two split parts -- TODO confirm against the CSV layout.
names(train) <- c("title", "score", "gender", "age")
# Put the demographic columns ahead of the score.
train <- train[, c("title", "gender", "age", "score")]

# Same Demo split for the test set, which also carries a title_id column that
# is discarded before the positional rename.
test <- transform(
  test,
  new = colsplit(test$Demo, split = " ", names = c("Gender", "Age"))
)
test[["Demo"]] <- NULL
test[["title_id"]] <- NULL
# Positional rename: assumes the remaining columns are the title, the score,
# and then the two split parts -- TODO confirm against the CSV layout.
names(test) <- c("title", "score", "gender", "age")
test <- test[, c("title", "gender", "age", "score")]

# Both frames should now expose the same four columns in the same order.
colnames(train)
## [1] "title"  "gender" "age"    "score"
colnames(test)
## [1] "title"  "gender" "age"    "score"
# Per-column summary of the train set
summary(object = train)
##                       title         gender          age      
##  AGATHA CHRISTIE'S POIROT:  12   Female:5110   [18-24]:1127  
##  ARROW                   :  12   Male  :4016   [25-34]:2630  
##  BLUE BLOODS             :  12                 [35-44]:2137  
##  BONES                   :  12                 [45-44]:1499  
##  BREAKING BAD            :  12                 [55-64]:1202  
##  CRIMINAL MINDS          :  12                 [65+]  : 531  
##  (Other)                 :9054                               
##      score       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 8.577  
##  3rd Qu.: 9.750  
##  Max.   :10.000  
## 
# Per-column summary of the test set
summary(object = test)
##                          title          gender          age      
##  10 Things I Hate About You :   12   Female:9144   [18-24]:3048  
##  100 Deeds For Eddie Mcdowd :   12   Male  :9144   [25-34]:3048  
##  14 Diaries Of The Great War:   12                 [35-44]:3048  
##  1600 Penn                  :   12                 [45-44]:3048  
##  17 Kids And Counting       :   12                 [55-64]:3048  
##  2 Broke Girls              :   12                 [65+]  :3048  
##  (Other)                    :18216                               
##      score      
##  Min.   :6.817  
##  1st Qu.:8.552  
##  Median :8.711  
##  Mean   :8.675  
##  3rd Qu.:8.839  
##  Max.   :9.777  
## 

TITLE AGGREGATION BY COUNT OF DEMOS AVAILABLE IN TRAIN DATA

library(plyr)
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:reshape':
## 
##     rename, round_any
# Work on a copy of the train set in which gender and age are fused into a
# single "demo" key.
demo.data <- data.frame(train)
demo.data$demo <- paste(demo.data$gender, demo.data$age)
demo.data[["gender"]] <- NULL
demo.data[["age"]] <- NULL

# For every title, count how many distinct demos it appears with.
demo.count <- aggregate(
  demo.data$demo,
  by = demo.data["title"],
  FUN = function(x) length(unique(x))
)
colnames(demo.count)[colnames(demo.count) == "x"] <- "Demo_Count"

# For every distinct-demo count, count how many titles have it.
title.count <- aggregate(
  demo.count$title,
  by = demo.count["Demo_Count"],
  FUN = function(x) length(unique(x))
)
colnames(title.count)[colnames(title.count) == "x"] <- "Title_Count"
title.count
##    Demo_Count Title_Count
## 1           1        1354
## 2           2         539
## 3           3         319
## 4           4         198
## 5           5         134
## 6           6         117
## 7           7          97
## 8           8          86
## 9           9          75
## 10         10          58
## 11         11          45
## 12         12          38

UNIQUE TITLES IN BOTH SETS

## number of distinct titles in the train set
length(unique(train[["title"]]))
## [1] 3060
## number of distinct titles in the test set
length(unique(test[["title"]]))
## [1] 1524

DENSITY PLOT

# Kernel density estimate of the train scores, drawn and then filled in red
d <- density(train$score)
plot(d, main = "Density of Train Set Scores")
polygon(d, border = "red", col = "red")

plot of chunk unnamed-chunk-4

# Kernel density estimate of the test scores, drawn and then filled in blue
d <- density(test$score)
plot(d, main = "Density of Test Set Scores")
polygon(d, border = "blue", col = "blue")

plot of chunk unnamed-chunk-4

HISTOGRAM

#### Histogram of the train scores (palette colour 2)
hist(train$score, main = "Train Set", col = 2)

plot of chunk unnamed-chunk-5

# Histogram of the test scores (palette colour 4)
hist(test$score, main = "Test Set", col = 4)

plot of chunk unnamed-chunk-5

BOXPLOT

### Boxplot of the train scores (palette colour 2)
boxplot(train$score, main = "Train Set", col = 2)

plot of chunk unnamed-chunk-6

# Boxplot of the test scores (palette colour 4)
boxplot(test$score, main = "Test Set", col = 4)

plot of chunk unnamed-chunk-6

DEMO BRACKETS BY MEAN SCORE

## mean score of every gender / age-bracket combination in the train set
demo.train <- aggregate(train$score, by = train[c("gender", "age")], FUN = mean)
# aggregate() names the summarised column "x"; give it a descriptive name.
colnames(demo.train)[colnames(demo.train) == "x"] <- "Mean_Score"
demo.train
##    gender     age Mean_Score
## 1  Female [18-24]   8.680260
## 2    Male [18-24]   8.858377
## 3  Female [25-34]   8.632253
## 4    Male [25-34]   8.606043
## 5  Female [35-44]   8.576573
## 6    Male [35-44]   8.508624
## 7  Female [45-44]   8.755212
## 8    Male [45-44]   8.457388
## 9  Female [55-64]   8.534187
## 10   Male [55-64]   8.188970
## 11 Female   [65+]   8.643007
## 12   Male   [65+]   8.320476
## mean score of every gender / age-bracket combination in the test set
demo.test <- aggregate(test$score, by = test[c("gender", "age")], FUN = mean)
# aggregate() names the summarised column "x"; give it a descriptive name.
colnames(demo.test)[colnames(demo.test) == "x"] <- "Mean_Score"
demo.test
##    gender     age Mean_Score
## 1  Female [18-24]   8.682935
## 2    Male [18-24]   8.778085
## 3  Female [25-34]   8.682935
## 4    Male [25-34]   8.682935
## 5  Female [35-44]   8.682935
## 6    Male [35-44]   8.682935
## 7  Female [45-44]   8.873578
## 8    Male [45-44]   8.567098
## 9  Female [55-64]   8.682935
## 10   Male [55-64]   8.443671
## 11 Female   [65+]   8.730983
## 12   Male   [65+]   8.608902

PIE CHARTS BY SUM OF SCORES

PIE CHART TRAIN SET

# Sum the train-set scores for every gender / age-bracket combination.
aggregation.age.train <- aggregate(train$score, by = train[c('gender','age')], sum)
names(aggregation.age.train)
## [1] "gender" "age"    "x"
# Turn the sums into pie slices (whole numbers) and percentage labels.
slices.train <- as.integer(aggregation.age.train$x)
# The aggregation has one row per gender AND age bracket, so the label must
# carry both; labelling with age alone gives every bracket two identical labels.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
pct.train <- round(slices.train / sum(slices.train) * 100)
lbls.train <- paste0(lbls.train, " ", pct.train, "%") # append the percentage

pie(slices.train, labels = lbls.train, col = rainbow(length(lbls.train)), main = "Age Brackets By Sum of Scores in Train Set")

plot of chunk unnamed-chunk-8

PIE CHART TEST SET

# Sum the test-set scores for every gender / age-bracket combination.
aggregation.age.test <- aggregate(test$score, by = test[c('gender','age')], sum)
names(aggregation.age.test)
## [1] "gender" "age"    "x"
# Turn the sums into pie slices (whole numbers) and percentage labels.
slices.test <- as.integer(aggregation.age.test$x)
# The aggregation has one row per gender AND age bracket, so the label must
# carry both; labelling with age alone gives every bracket two identical labels.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
pct.test <- round(slices.test / sum(slices.test) * 100)
lbls.test <- paste0(lbls.test, " ", pct.test, "%") # append the percentage

pie(slices.test, labels = lbls.test, col = rainbow(length(lbls.test)), main = "Age Brackets By Sum of Scores in test Set")

plot of chunk unnamed-chunk-9

PIE CHARTS BY MEAN OF SCORES

PIE CHART TRAIN SET

# Average the train-set scores for every gender / age-bracket combination.
aggregation.age.train <- aggregate(train$score, by = train[c('gender','age')], mean)
names(aggregation.age.train)
## [1] "gender" "age"    "x"
# Use the means as they are: as.integer() would truncate each of them to the
# same whole number and make all slices identical.
slices.train <- aggregation.age.train$x
# The aggregation has one row per gender AND age bracket, so the label must
# carry both; labelling with age alone gives every bracket two identical labels.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
# One decimal place, because the shares differ by less than one percent.
pct.train <- round(slices.train / sum(slices.train) * 100, 1)
lbls.train <- paste0(lbls.train, " ", sprintf("%.1f", pct.train), "%")

# The title names the statistic actually plotted (mean, not sum).
pie(slices.train, labels = lbls.train, col = rainbow(length(lbls.train)), main = "Age Brackets By Mean of Scores in Train Set")

plot of chunk unnamed-chunk-10

PIE CHART TEST SET

# Average the test-set scores for every gender / age-bracket combination.
aggregation.age.test <- aggregate(test$score, by = test[c('gender','age')], mean)
names(aggregation.age.test)
## [1] "gender" "age"    "x"
# Use the means as they are: as.integer() would truncate each of them to the
# same whole number and make all slices identical.
slices.test <- aggregation.age.test$x
# The aggregation has one row per gender AND age bracket, so the label must
# carry both; labelling with age alone gives every bracket two identical labels.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
# One decimal place, because the shares differ by less than one percent.
pct.test <- round(slices.test / sum(slices.test) * 100, 1)
lbls.test <- paste0(lbls.test, " ", sprintf("%.1f", pct.test), "%")

# The title names the statistic actually plotted (mean, not sum).
pie(slices.test, labels = lbls.test, col = rainbow(length(lbls.test)), main = "Age Brackets By Mean of Scores in test Set")

plot of chunk unnamed-chunk-11