Comparison of Panel_UK Train And Test Sets

This report compares summary statistics of the train set against those of the test set for the UK panel data.

DATA LOADING AND SUMMARY STATS:

#### ===================================
###  Train Vs. Test Data Distribution
### ====================================

# Read the labelled train set and the BDT_UK results file used as the test set.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): absolute, machine-specific Windows paths -- confirm they exist
# on the machine that re-runs this report.
train <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\UK\\class_label_UK.csv", header = TRUE)
test <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\UK\\Results\\BDT_UK.csv", header = TRUE)
library(reshape)

### Split the "Demo" column on the space into two new columns, drop the
### original, then rename and reorder so both sets share the layout
### title / gender / age / score.
### NOTE(review): the renaming is positional -- it assumes the remaining
### columns arrive as title, score, then the two split columns; confirm
### against the CSV headers.
train <- data.frame(
  train,
  new = colsplit(train$Demo, split = " ", names = c("Gender", "Age"))
)
train$Demo <- NULL
names(train) <- c("title", "score", "gender", "age")
train <- train[c("title", "gender", "age", "score")]

test <- data.frame(
  test,
  new = colsplit(test$Demo, split = " ", names = c("Gender", "Age"))
)
test$Demo <- NULL
# the test file carries an extra title_id column that the train file lacks
test$title_id <- NULL
names(test) <- c("title", "score", "gender", "age")
test <- test[c("title", "gender", "age", "score")]

names(train)

## [1] "title"  "gender" "age"    "score"

names(test)

## [1] "title"  "gender" "age"    "score"

summary(train)

##                    title         gender          age      
##  24                   :  12   Female:2901   [18-24]: 840  
##  AMERICAN HORROR STORY:  12   Male  :2848   [25-34]:1512  
##  ARROW                :  12                 [35-44]:1370  
##  BETTER CALL SAUL     :  12                 [45-44]:1027  
##  BLACK SAILS          :  12                 [55-64]: 693  
##  BOSCH                :  12                 [65+]  : 307  
##  (Other)              :5677                               
##      score       
##  Min.   : 1.000  
##  1st Qu.: 7.500  
##  Median : 8.333  
##  Mean   : 8.160  
##  3rd Qu.: 9.119  
##  Max.   :10.000  
##

summary(test)

##                          title          gender           age      
##  10 Things I Hate About You :   12   Female:10788   [18-24]:3596  
##  100 Deeds For Eddie Mcdowd :   12   Male  :10788   [25-34]:3596  
##  11.22.63                   :   12                  [35-44]:3596  
##  12 Monkeys                 :   12                  [45-44]:3596  
##  14 Diaries Of The Great War:   12                  [55-64]:3596  
##  1600 Penn                  :   12                  [65+]  :3596  
##  (Other)                    :21504                                
##      score      
##  Min.   :6.502  
##  1st Qu.:8.050  
##  Median :8.281  
##  Mean   :8.263  
##  3rd Qu.:8.501  
##  Max.   :9.638  
##

UNIQUE TITLES IN BOTH SETS

## count of unique titles in the train set
length(unique(train$title))

## [1] 1706

## count of unique titles in the test set
length(unique(test$title))

## [1] 1798

DENSITY PLOT

# Kernel density estimate of the train-set scores, drawn as an outline and
# then filled in red by overlaying a polygon on the same curve.
d <- density(x = train$score)
plot(x = d, main = "Density of Train Set Scores")
polygon(x = d, col = "red", border = "red")

plot of chunk unnamed-chunk-3

# Kernel density estimate of the test-set scores, drawn as an outline and
# then filled in blue by overlaying a polygon on the same curve.
d <- density(x = test$score)
plot(x = d, main = "Density of Test Set Scores")
polygon(x = d, col = "blue", border = "blue")

plot of chunk unnamed-chunk-3

HISTOGRAM

#### histograms
hist(train$score, col=2, main="Train Set")

plot of chunk unnamed-chunk-4

hist(test$score, col=4, main="Test Set")

plot of chunk unnamed-chunk-4

BOXPLOT

###boxplot
boxplot(train$score, col=2, main="Train Set")

plot of chunk unnamed-chunk-5

boxplot(test$score, col=4, main="Test Set")

plot of chunk unnamed-chunk-5

DEMO BRACKETS BY MEAN SCORE

## mean score of every gender/age bracket in the train set
demo.train <- aggregate(
  x = train$score,
  by = train[c("gender", "age")],
  FUN = mean
)
## aggregate() names the summarised column "x"; give it a descriptive name
colnames(demo.train)[colnames(demo.train) == "x"] <- "Mean_Score"
demo.train

##    gender     age Mean_Score
## 1  Female [18-24]   8.089545
## 2    Male [18-24]   8.025117
## 3  Female [25-34]   8.133394
## 4    Male [25-34]   8.086377
## 5  Female [35-44]   8.245984
## 6    Male [35-44]   8.084273
## 7  Female [45-44]   8.244510
## 8    Male [45-44]   8.176176
## 9  Female [55-64]   8.385293
## 10   Male [55-64]   8.058954
## 11 Female   [65+]   8.404800
## 12   Male   [65+]   8.373656

## mean score of every gender/age bracket in the test set
demo.test <- aggregate(
  x = test$score,
  by = test[c("gender", "age")],
  FUN = mean
)
## aggregate() names the summarised column "x"; give it a descriptive name
colnames(demo.test)[colnames(demo.test) == "x"] <- "Mean_Score"
demo.test

##    gender     age Mean_Score
## 1  Female [18-24]   8.273403
## 2    Male [18-24]   8.151150
## 3  Female [25-34]   8.273403
## 4    Male [25-34]   8.195567
## 5  Female [35-44]   8.324387
## 6    Male [35-44]   8.208744
## 7  Female [45-44]   8.330833
## 8    Male [45-44]   8.307372
## 9  Female [55-64]   8.273403
## 10   Male [55-64]   8.273403
## 11 Female   [65+]   8.273403
## 12   Male   [65+]   8.273403

PIE CHARTS BY SUM OF SCORES

PIE CHART TRAIN SET

# Sum the train-set scores within each gender/age bracket.
aggregation.age.train <- aggregate(
  train$score,
  by = train[c("gender", "age")],
  FUN = sum
)
names(aggregation.age.train)

## [1] "gender" "age"    "x"

# One slice per gender/age bracket, sized by that bracket's summed score.
# The sums are used as-is; truncating them to integers only loses precision.
slices.train <- aggregation.age.train$x
pct.train <- round(slices.train / sum(slices.train) * 100)
# Label each slice with gender, age and its rounded percentage share. The
# aggregate has one row per gender/age pair, so labelling by age alone would
# show every age bracket twice with nothing to tell the slices apart.
lbls.train <- paste0(
  aggregation.age.train$gender, " ",
  aggregation.age.train$age, " ",
  pct.train, "%"
)

pie(
  slices.train,
  labels = lbls.train,
  col = rainbow(length(lbls.train)),
  main = "Age Brackets By Sum of Scores in Train Set"
)

plot of chunk unnamed-chunk-7

PIE CHART TEST SET

# Sum the test-set scores within each gender/age bracket.
aggregation.age.test <- aggregate(
  test$score,
  by = test[c("gender", "age")],
  FUN = sum
)
names(aggregation.age.test)

## [1] "gender" "age"    "x"

# One slice per gender/age bracket, sized by that bracket's summed score.
# The sums are used as-is; truncating them to integers only loses precision.
slices.test <- aggregation.age.test$x
pct.test <- round(slices.test / sum(slices.test) * 100)
# Label each slice with gender, age and its rounded percentage share. The
# aggregate has one row per gender/age pair, so labelling by age alone would
# show every age bracket twice with nothing to tell the slices apart.
lbls.test <- paste0(
  aggregation.age.test$gender, " ",
  aggregation.age.test$age, " ",
  pct.test, "%"
)

pie(
  slices.test,
  labels = lbls.test,
  col = rainbow(length(lbls.test)),
  main = "Age Brackets By Sum of Scores in test Set"
)

plot of chunk unnamed-chunk-8

PIE CHARTS BY MEAN OF SCORES

PIE CHART TRAIN SET

# Average the train-set scores within each gender/age bracket.
aggregation.age.train <- aggregate(
  train$score,
  by = train[c("gender", "age")],
  FUN = mean
)
names(aggregation.age.train)

## [1] "gender" "age"    "x"

# One slice per gender/age bracket, sized by that bracket's mean score.
# The means stay as doubles: as.integer() would truncate near-equal means
# (e.g. 8.09 and 8.40) to the same whole number, making every slice and
# every percentage identical.
slices.train <- aggregation.age.train$x
pct.train <- round(slices.train / sum(slices.train) * 100)
# Label each slice with gender, age and its rounded percentage share. The
# aggregate has one row per gender/age pair, so labelling by age alone would
# show every age bracket twice with nothing to tell the slices apart.
lbls.train <- paste0(
  aggregation.age.train$gender, " ",
  aggregation.age.train$age, " ",
  pct.train, "%"
)

# The title names the statistic actually plotted (mean, not sum).
pie(
  slices.train,
  labels = lbls.train,
  col = rainbow(length(lbls.train)),
  main = "Age Brackets By Mean of Scores in Train Set"
)

plot of chunk unnamed-chunk-9

PIE CHART TEST SET

# Average the test-set scores within each gender/age bracket.
aggregation.age.test <- aggregate(
  test$score,
  by = test[c("gender", "age")],
  FUN = mean
)
names(aggregation.age.test)

## [1] "gender" "age"    "x"

# One slice per gender/age bracket, sized by that bracket's mean score.
# The means stay as doubles: as.integer() would truncate near-equal means
# (e.g. 8.15 and 8.33) to the same whole number, making every slice and
# every percentage identical.
slices.test <- aggregation.age.test$x
pct.test <- round(slices.test / sum(slices.test) * 100)
# Label each slice with gender, age and its rounded percentage share. The
# aggregate has one row per gender/age pair, so labelling by age alone would
# show every age bracket twice with nothing to tell the slices apart.
lbls.test <- paste0(
  aggregation.age.test$gender, " ",
  aggregation.age.test$age, " ",
  pct.test, "%"
)

# The title names the statistic actually plotted (mean, not sum).
pie(
  slices.test,
  labels = lbls.test,
  col = rainbow(length(lbls.test)),
  main = "Age Brackets By Mean of Scores in test Set"
)

plot of chunk unnamed-chunk-10