This report compares summary statistics of the train set against those of the test set for the US data.
DATA LOADING AND SUMMARY STATS:
#### ===================================
### Train Vs. Test Data Distribution
### ====================================
# Load the train labels and the test-set results, then bring both into the
# same four-column layout: title, gender, age, score.
library(reshape)  # provides colsplit()
train <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\US\\class_label_US.csv", header = TRUE)
test <- read.csv("C:\\downloads\\Demographics\\New_Demographics\\Regression\\Panel\\US\\Results\\BDT_US.csv", header = TRUE)
### split the Demo column into two (Gender, Age) on the space separator
train <- transform(
  train,
  new = colsplit(train$Demo, split = " ", names = c("Gender", "Age"))
)
train$Demo <- NULL
# Positional rename: relies on the remaining columns being, in order,
# title, score, then the two split columns -- TODO confirm against the CSV.
train <- setNames(train, c("title", "score", "gender", "age"))
train <- train[c("title", "gender", "age", "score")]
# Same reshaping for the test set, which additionally carries a title_id
# column that is dropped before the positional rename.
test <- transform(
  test,
  new = colsplit(test$Demo, split = " ", names = c("Gender", "Age"))
)
test$Demo <- NULL
test$title_id <- NULL
test <- setNames(test, c("title", "score", "gender", "age"))
test <- test[c("title", "gender", "age", "score")]
# NOTE(review): the age level "[45-44]" that shows up in the summaries of both
# sets looks like a typo for "[45-54]" in the input data -- confirm upstream.
# Confirm both sets now expose the same columns in the same order
colnames(train)
## [1] "title" "gender" "age" "score"
colnames(test)
## [1] "title" "gender" "age" "score"
# Print a per-column summary of the train set
summary(train)
## title gender age
## AGATHA CHRISTIE'S POIROT: 12 Female:5110 [18-24]:1127
## ARROW : 12 Male :4016 [25-34]:2630
## BLUE BLOODS : 12 [35-44]:2137
## BONES : 12 [45-44]:1499
## BREAKING BAD : 12 [55-64]:1202
## CRIMINAL MINDS : 12 [65+] : 531
## (Other) :9054
## score
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 8.577
## 3rd Qu.: 9.750
## Max. :10.000
##
# Print a per-column summary of the test set
summary(test)
## title gender age
## 10 Things I Hate About You : 12 Female:9144 [18-24]:3048
## 100 Deeds For Eddie Mcdowd : 12 Male :9144 [25-34]:3048
## 14 Diaries Of The Great War: 12 [35-44]:3048
## 1600 Penn : 12 [45-44]:3048
## 17 Kids And Counting : 12 [55-64]:3048
## 2 Broke Girls : 12 [65+] :3048
## (Other) :18216
## score
## Min. :6.817
## 1st Qu.:8.552
## Median :8.711
## Mean :8.675
## 3rd Qu.:8.839
## Max. :9.777
##
UNIQUE TITLES IN BOTH SETS
## number of distinct titles in the train set
sum(!duplicated(train$title))
## [1] 3060
## number of distinct titles in the test set
sum(!duplicated(test$title))
## [1] 1524
DENSITY PLOT
# Kernel density of the train scores, drawn as a red filled curve
d <- density(train$score)
plot(x = d, main = "Density of Train Set Scores")
polygon(x = d, border = "red", col = "red")
# Kernel density of the test scores, drawn as a blue filled curve
d <- density(test$score)
plot(x = d, main = "Density of Test Set Scores")
polygon(x = d, border = "blue", col = "blue")
HISTOGRAM
#### score histograms, one per set (palette colour 2 for train, 4 for test)
hist(train$score, main = "Train Set", col = 2)
hist(test$score, main = "Test Set", col = 4)
BOXPLOT
### score boxplots, one per set (same colours as the histograms)
boxplot(train$score, main = "Train Set", col = 2)
boxplot(test$score, main = "Test Set", col = 4)
DEMO BRACKETS BY MEAN SCORE
## mean score per gender x age bracket in the train set
demo.train <- aggregate(
  train$score,
  by = list(gender = train$gender, age = train$age),
  FUN = mean
)
# aggregate() names the value column "x"; give it a descriptive name
names(demo.train)[match("x", names(demo.train))] <- "Mean_Score"
demo.train
## gender age Mean_Score
## 1 Female [18-24] 8.680260
## 2 Male [18-24] 8.858377
## 3 Female [25-34] 8.632253
## 4 Male [25-34] 8.606043
## 5 Female [35-44] 8.576573
## 6 Male [35-44] 8.508624
## 7 Female [45-44] 8.755212
## 8 Male [45-44] 8.457388
## 9 Female [55-64] 8.534187
## 10 Male [55-64] 8.188970
## 11 Female [65+] 8.643007
## 12 Male [65+] 8.320476
## mean score per gender x age bracket in the test set
demo.test <- aggregate(
  test$score,
  by = list(gender = test$gender, age = test$age),
  FUN = mean
)
# aggregate() names the value column "x"; give it a descriptive name
names(demo.test)[match("x", names(demo.test))] <- "Mean_Score"
demo.test
## gender age Mean_Score
## 1 Female [18-24] 8.682935
## 2 Male [18-24] 8.778085
## 3 Female [25-34] 8.682935
## 4 Male [25-34] 8.682935
## 5 Female [35-44] 8.682935
## 6 Male [35-44] 8.682935
## 7 Female [45-44] 8.873578
## 8 Male [45-44] 8.567098
## 9 Female [55-64] 8.682935
## 10 Male [55-64] 8.443671
## 11 Female [65+] 8.730983
## 12 Male [65+] 8.608902
PIE CHARTS BY SUM OF SCORES
PIE CHART TRAIN SET
# Total score per gender x age-bracket group in the train set
aggregation.age.train <- aggregate(train$score, by = train[c("gender", "age")], sum)
names(aggregation.age.train)
## [1] "gender" "age" "x"
# One slice per group, sized by the group's (whole-number) score total
slices.train <- as.integer(aggregation.age.train$x)
# Label with gender AND age: the slices are gender x age groups, so an
# age-only label would appear twice and the two slices could not be told apart.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
pct.train <- round(slices.train / sum(slices.train) * 100)
lbls.train <- paste0(lbls.train, " ", pct.train, "%") # append the percentage share
pie(
  slices.train,
  labels = lbls.train,
  col = rainbow(length(lbls.train)),
  main = "Gender/Age Brackets By Sum of Scores in Train Set"
)
PIE CHART TEST SET
# Total score per gender x age-bracket group in the test set
aggregation.age.test <- aggregate(test$score, by = test[c("gender", "age")], sum)
names(aggregation.age.test)
## [1] "gender" "age" "x"
# One slice per group, sized by the group's (whole-number) score total
slices.test <- as.integer(aggregation.age.test$x)
# Label with gender AND age: the slices are gender x age groups, so an
# age-only label would appear twice and the two slices could not be told apart.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
pct.test <- round(slices.test / sum(slices.test) * 100)
lbls.test <- paste0(lbls.test, " ", pct.test, "%") # append the percentage share
pie(
  slices.test,
  labels = lbls.test,
  col = rainbow(length(lbls.test)),
  main = "Gender/Age Brackets By Sum of Scores in test Set"
)
PIE CHARTS BY MEAN OF SCORES
PIE CHART TRAIN SET
# Mean score per gender x age-bracket group in the train set
aggregation.age.train <- aggregate(train$score, by = train[c("gender", "age")], mean)
names(aggregation.age.train)
## [1] "gender" "age" "x"
# Keep the means as-is: truncating them with as.integer() would collapse
# every group mean to the same whole number and make all slices identical.
slices.train <- aggregation.age.train$x
# Label with gender AND age: the slices are gender x age groups, so an
# age-only label would appear twice and the two slices could not be told apart.
lbls.train <- paste(aggregation.age.train$gender, aggregation.age.train$age)
# One decimal place, because the group means are close together and
# whole-number percentages would hide the differences between them.
pct.train <- round(slices.train / sum(slices.train) * 100, 1)
lbls.train <- paste0(lbls.train, " ", pct.train, "%") # append the percentage share
pie(
  slices.train,
  labels = lbls.train,
  col = rainbow(length(lbls.train)),
  main = "Gender/Age Brackets By Mean of Scores in Train Set"
)
PIE CHART TEST SET
# Mean score per gender x age-bracket group in the test set
aggregation.age.test <- aggregate(test$score, by = test[c("gender", "age")], mean)
names(aggregation.age.test)
## [1] "gender" "age" "x"
# Keep the means as-is: truncating them with as.integer() would collapse
# every group mean to the same whole number and make all slices identical.
slices.test <- aggregation.age.test$x
# Label with gender AND age: the slices are gender x age groups, so an
# age-only label would appear twice and the two slices could not be told apart.
lbls.test <- paste(aggregation.age.test$gender, aggregation.age.test$age)
# One decimal place, because the group means are close together and
# whole-number percentages would hide the differences between them.
pct.test <- round(slices.test / sum(slices.test) * 100, 1)
lbls.test <- paste0(lbls.test, " ", pct.test, "%") # append the percentage share
pie(
  slices.test,
  labels = lbls.test,
  col = rainbow(length(lbls.test)),
  main = "Gender/Age Brackets By Mean of Scores in test Set"
)