This report compares the summary statistics of the train set against those of the test set for the UK data.
DATA LOADING AND SUMMARY STATS:
#### ===================================
### Train Vs. Test Data Distribution
### ====================================
library(reshape)  # provides colsplit()

# Load the train labels and the test-set scores.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer turns text
# columns into factors by default, and the summary() tables in this report
# tabulate the text columns by level.
train <- read.csv(
  "C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\UK\\class_label_UK.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
test <- read.csv(
  "C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\UK\\Results\\BDT_UK.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

### split the Demo column into two (gender, age) on the space
# The split columns are appended after the existing ones, Demo is dropped, and
# the remaining four columns are renamed by position.
# NOTE(review): the positional rename assumes the input columns left over are
# title and score, in that order -- confirm against the CSV headers.
train <- transform(
  train,
  new = colsplit(train$Demo, split = " ", names = c("Gender", "Age"))
)
train$Demo <- NULL
train <- setNames(train, c("title", "score", "gender", "age"))
train <- train[c("title", "gender", "age", "score")]

# Same reshaping for the test set, which additionally carries a title_id
# column that is dropped before the positional rename.
test <- transform(
  test,
  new = colsplit(test$Demo, split = " ", names = c("Gender", "Age"))
)
test$Demo <- NULL
test$title_id <- NULL
test <- setNames(test, c("title", "score", "gender", "age"))
test <- test[c("title", "gender", "age", "score")]
# Print the column names of both frames; the rendered output shows that they
# share the same four columns in the same order.
names(train)
## [1] "title" "gender" "age" "score"
names(test)
## [1] "title" "gender" "age" "score"
# Per-column summary of the train set.
# NOTE(review): the age level printed as "[45-44]" looks like a mislabelled
# 45-54 bracket in the source data -- confirm against the CSV.
summary(train)
## title gender age
## 24 : 12 Female:2901 [18-24]: 840
## AMERICAN HORROR STORY: 12 Male :2848 [25-34]:1512
## ARROW : 12 [35-44]:1370
## BETTER CALL SAUL : 12 [45-44]:1027
## BLACK SAILS : 12 [55-64]: 693
## BOSCH : 12 [65+] : 307
## (Other) :5677
## score
## Min. : 1.000
## 1st Qu.: 7.500
## Median : 8.333
## Mean : 8.160
## 3rd Qu.: 9.119
## Max. :10.000
##
# Per-column summary of the test set.
# NOTE(review): the age level printed as "[45-44]" looks like a mislabelled
# 45-54 bracket in the source data -- confirm against the CSV.
summary(test)
## title gender age
## 10 Things I Hate About You : 12 Female:10788 [18-24]:3596
## 100 Deeds For Eddie Mcdowd : 12 Male :10788 [25-34]:3596
## 11.22.63 : 12 [35-44]:3596
## 12 Monkeys : 12 [45-44]:3596
## 14 Diaries Of The Great War: 12 [55-64]:3596
## 1600 Penn : 12 [65+] :3596
## (Other) :21504
## score
## Min. :6.502
## 1st Qu.:8.050
## Median :8.281
## Mean :8.263
## 3rd Qu.:8.501
## Max. :9.638
##
UNIQUE TITLES IN BOTH SETS
# number of distinct titles in the train set
length(unique(train[["title"]]))
## [1] 1706
# number of distinct titles in the test set
length(unique(test[["title"]]))
## [1] 1798
DENSITY PLOT
# Kernel density estimate of the train scores, drawn as a solid red shape
d <- density(x = train$score)
plot(x = d, main = "Density of Train Set Scores")
polygon(x = d, border = "red", col = "red")
# Filled Density Plot for Test Scores
# (d is reassigned here, replacing the train-set density computed earlier)
d <- density(test$score)
plot(d, main = "Density of Test Set Scores")
polygon(d, col = "blue", border = "blue")
HISTOGRAM
# Score histograms, one per data set (palette colour 2 for train, 4 for test)
hist(train$score, main = "Train Set", col = 2)
hist(test$score, main = "Test Set", col = 4)
BOXPLOT
# Score boxplots, one per data set (palette colour 2 for train, 4 for test)
boxplot(train$score, main = "Train Set", col = 2)
boxplot(test$score, main = "Test Set", col = 4)
DEMO BRACKETS BY MEAN SCORE
# mean score for every gender / age bracket combination in the train set;
# the aggregated column is named Mean_Score directly via the named list
demo.train <- aggregate(
  list(Mean_Score = train$score),
  by = train[c("gender", "age")],
  FUN = mean
)
demo.train
## gender age Mean_Score
## 1 Female [18-24] 8.089545
## 2 Male [18-24] 8.025117
## 3 Female [25-34] 8.133394
## 4 Male [25-34] 8.086377
## 5 Female [35-44] 8.245984
## 6 Male [35-44] 8.084273
## 7 Female [45-44] 8.244510
## 8 Male [45-44] 8.176176
## 9 Female [55-64] 8.385293
## 10 Male [55-64] 8.058954
## 11 Female [65+] 8.404800
## 12 Male [65+] 8.373656
# mean score for every gender / age bracket combination in the test set;
# the aggregated column is named Mean_Score directly via the named list
demo.test <- aggregate(
  list(Mean_Score = test$score),
  by = test[c("gender", "age")],
  FUN = mean
)
demo.test
## gender age Mean_Score
## 1 Female [18-24] 8.273403
## 2 Male [18-24] 8.151150
## 3 Female [25-34] 8.273403
## 4 Male [25-34] 8.195567
## 5 Female [35-44] 8.324387
## 6 Male [35-44] 8.208744
## 7 Female [45-44] 8.330833
## 8 Male [45-44] 8.307372
## 9 Female [55-64] 8.273403
## 10 Male [55-64] 8.273403
## 11 Female [65+] 8.273403
## 12 Male [65+] 8.273403
PIE CHARTS BY SUM OF SCORES
PIE CHART TRAIN SET
# Total score per gender x age bracket in the train set (one row per pair).
aggregation.age.train <- aggregate(train$score, by = train[c("gender", "age")], FUN = sum)
names(aggregation.age.train)
## [1] "gender" "age" "x"
# One slice per aggregated row. The sums are used as they are: truncating them
# with as.integer() only distorted the shares slightly.
slices.train <- aggregation.age.train$x
# Label each slice with gender as well as age. Every age bracket occurs once
# per gender, so age-only labels appeared twice and could not be told apart.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
pct.train <- round(slices.train / sum(slices.train) * 100)
lbls.train <- paste0(lbls.train, " ", pct.train, "%")  # "<gender> <age> <pct>%"
pie(slices.train, labels = lbls.train, col = rainbow(length(lbls.train)),
    main = "Age Brackets By Sum of Scores in Train Set")
PIE CHART TEST SET
# Total score per gender x age bracket in the test set (one row per pair).
aggregation.age.test <- aggregate(test$score, by = test[c("gender", "age")], FUN = sum)
names(aggregation.age.test)
## [1] "gender" "age" "x"
# One slice per aggregated row. The sums are used as they are: truncating them
# with as.integer() only distorted the shares slightly.
slices.test <- aggregation.age.test$x
# Label each slice with gender as well as age. Every age bracket occurs once
# per gender, so age-only labels appeared twice and could not be told apart.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
pct.test <- round(slices.test / sum(slices.test) * 100)
lbls.test <- paste0(lbls.test, " ", pct.test, "%")  # "<gender> <age> <pct>%"
pie(slices.test, labels = lbls.test, col = rainbow(length(lbls.test)),
    main = "Age Brackets By Sum of Scores in test Set")
PIE CHARTS BY MEAN OF SCORES
PIE CHART TRAIN SET
# Mean score per gender x age bracket in the train set (one row per pair).
aggregation.age.train <- aggregate(train$score, by = train[c("gender", "age")], FUN = mean)
names(aggregation.age.train)
## [1] "gender" "age" "x"
# Keep the means as doubles. as.integer() truncated them to whole numbers, and
# because the group means all share the same integer part (8.x in this
# report), every slice came out exactly the same size.
slices.train <- aggregation.age.train$x
# Label each slice with gender as well as age. Every age bracket occurs once
# per gender, so age-only labels appeared twice and could not be told apart.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
# One decimal place: the group means are close together, so whole-number
# percentages would all print the same value.
pct.train <- round(slices.train / sum(slices.train) * 100, 1)
lbls.train <- paste0(lbls.train, " ", pct.train, "%")  # "<gender> <age> <pct>%"
# The title now says "Mean": it was copied from the sum-of-scores chart.
pie(slices.train, labels = lbls.train, col = rainbow(length(lbls.train)),
    main = "Age Brackets By Mean of Scores in Train Set")
PIE CHART TEST SET
# Mean score per gender x age bracket in the test set (one row per pair).
aggregation.age.test <- aggregate(test$score, by = test[c("gender", "age")], FUN = mean)
names(aggregation.age.test)
## [1] "gender" "age" "x"
# Keep the means as doubles. as.integer() truncated them to whole numbers, and
# because the group means all share the same integer part (8.x in this
# report), every slice came out exactly the same size.
slices.test <- aggregation.age.test$x
# Label each slice with gender as well as age. Every age bracket occurs once
# per gender, so age-only labels appeared twice and could not be told apart.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
# One decimal place: the group means are close together, so whole-number
# percentages would all print the same value.
pct.test <- round(slices.test / sum(slices.test) * 100, 1)
lbls.test <- paste0(lbls.test, " ", pct.test, "%")  # "<gender> <age> <pct>%"
# The title now says "Mean": it was copied from the sum-of-scores chart.
pie(slices.test, labels = lbls.test, col = rainbow(length(lbls.test)),
    main = "Age Brackets By Mean of Scores in test Set")