Aim to achieve
Our motive to conduct this analysis is to implement the psycological analysis using the ML algorithms instead of traditional scoring system used by psychiatrists.
We are going to build a Linear Regression model and evaluate the dataset to predict the factor values and eliminate the discreteness present in the evaluation.
Import the important libraries and some helpful functions.
library(caret)
library(RCurl)
library(FNN)
library(class)
library(gmodels)
library(splitstackshape)
library(tidyverse)
library(randomForest)
normalize <-function(x){
x <- x/5
x <- format(x, digits = 2, nsmall = 1)
return (x)
}
opn_key <- function(x){
if(is.data.frame(x)){
op <- x[, c(1, 3, 5, 7, 8, 9, 10)]
on <- x[, c(2, 4, 6)]
return((sum(op) - sum(on) + 8)*2.5)
}
op <- x[c(1, 3, 5, 7, 8, 9, 10)]
on <- x[c(2, 4, 6)]
return((sum(op) - sum(on) + 8)*2.5)
}
csn_key <- function(x){
if(is.data.frame(x)){
cp <- x[, c(1, 3, 5, 7, 9, 10)]
cn <- x[, c(2, 4, 6, 8)]
return((sum(cp) - sum(cn) + 14)*2.5)
}
cp <- x[c(1, 3, 5, 7, 9, 10)]
cn <- x[c(2, 4, 6, 8)]
return((sum(cp) - sum(cn) + 14)*2.5)
}
ext_key <- function(x){
if(is.data.frame(x)){
ep <- x[, c(1, 3, 5, 7, 9)]
en <- x[, c(2, 4, 6, 8, 10)]
return((sum(ep) - sum(en) + 20)*2.5)
}
ep <- x[c(1, 3, 5, 7, 9)]
en <- x[c(2, 4, 6, 8, 10)]
return((sum(ep) - sum(en) + 20)*2.5)
}
agr_key <- function(x){
if(is.data.frame(x)){
ap <- x[, c(2, 4, 6, 8, 9, 10)]
an <- x[, c(1, 3, 5, 7)]
return((sum(ap) - sum(an) + 14)*2.5)
}
ap <- x[c(2, 4, 6, 8, 9, 10)]
an <- x[c(1, 3, 5, 7)]
return((sum(ap) - sum(an) + 14)*2.5)
}
neu_key <- function(x){
if(is.data.frame(x)){
np <- x[, c(2, 4)]
nn <- x[, c(1, 3, 5, 6, 7, 8, 9, 10)]
return((sum(np) - sum(nn) + 38)*2.5)
}
np <- x[c(2, 4)]
nn <- x[c(1, 3, 5, 6, 7, 8, 9, 10)]
return((sum(np) - sum(nn) + 38)*2.5)
}
Read dataset
data_set <- read.csv("dataset.csv",
header = TRUE,
nrows = 10000)
# If you are using the Kaggle dataset, then run the code below.
# data_set <- read.csv("final_data.txt", sep = "/t", header = TRUE, nrow = 10000)
# data_set <- data_set[, 50]
Preview of Dataset
As you can observe that our dataset have 50 columns. Each columns represents the responses by users for a specific question. There are 5 different column names that specify which factor the respective question belong to.
head(data_set)
## OPN1 OPN2 OPN3 OPN4 OPN5 OPN6 OPN7 OPN8 OPN9 OPN10 CSN1 CSN2 CSN3 CSN4 CSN5
## 1 5 1 4 1 4 1 5 3 4 5 3 4 3 2 2
## 2 1 2 4 2 3 1 4 2 5 3 3 2 5 3 3
## 3 5 1 2 1 4 2 5 3 4 4 4 2 2 2 3
## 4 4 2 5 2 3 1 4 4 3 3 2 4 4 4 1
## 5 5 1 5 1 5 1 5 3 5 5 5 1 5 1 3
## 6 5 1 5 1 3 1 5 4 5 2 3 2 4 1 3
## CSN6 CSN7 CSN8 CSN9 CSN10 EXT1 EXT2 EXT3 EXT4 EXT5 EXT6 EXT7 EXT8 EXT9 EXT10
## 1 4 4 2 4 4 4 1 5 2 5 1 5 2 4 1
## 2 1 3 3 5 3 3 5 3 4 3 3 2 5 1 5
## 3 3 4 2 4 2 2 3 4 4 3 2 1 3 2 5
## 4 2 2 3 1 4 2 2 2 3 4 2 2 4 1 4
## 5 1 5 1 5 5 3 3 3 3 5 3 3 5 3 4
## 6 2 4 3 4 3 3 3 4 2 4 2 2 3 3 4
## AGR1 AGR2 AGR3 AGR4 AGR5 AGR6 AGR7 AGR8 AGR9 AGR10 NEU1 NEU2 NEU3 NEU4 NEU5
## 1 2 5 2 4 2 3 2 4 3 4 1 4 4 2 2
## 2 1 4 1 5 1 5 3 4 5 3 2 3 4 1 3
## 3 1 4 1 4 2 4 1 4 4 3 4 4 4 2 2
## 4 2 4 3 4 2 4 2 4 3 4 3 3 3 2 3
## 5 1 5 1 5 1 3 1 5 5 3 1 5 5 3 1
## 6 2 3 1 4 2 3 2 3 4 4 3 4 3 2 2
## NEU6 NEU7 NEU8 NEU9 NEU10
## 1 2 2 2 3 2
## 2 1 2 1 3 1
## 3 2 2 2 1 3
## 4 2 2 2 4 3
## 5 1 1 1 3 2
## 6 1 2 1 2 2
Data Preprocessing
Remove NaN, NA and NULL from the dataset
i <- seq(1, 50)
data_set[, i] <- apply(data_set[, i], 2, function(x) as.numeric(as.character(x)))
data_set <- na.omit(data_set)
head(data_set)
## OPN1 OPN2 OPN3 OPN4 OPN5 OPN6 OPN7 OPN8 OPN9 OPN10 CSN1 CSN2 CSN3 CSN4 CSN5
## 1 5 1 4 1 4 1 5 3 4 5 3 4 3 2 2
## 2 1 2 4 2 3 1 4 2 5 3 3 2 5 3 3
## 3 5 1 2 1 4 2 5 3 4 4 4 2 2 2 3
## 4 4 2 5 2 3 1 4 4 3 3 2 4 4 4 1
## 5 5 1 5 1 5 1 5 3 5 5 5 1 5 1 3
## 6 5 1 5 1 3 1 5 4 5 2 3 2 4 1 3
## CSN6 CSN7 CSN8 CSN9 CSN10 EXT1 EXT2 EXT3 EXT4 EXT5 EXT6 EXT7 EXT8 EXT9 EXT10
## 1 4 4 2 4 4 4 1 5 2 5 1 5 2 4 1
## 2 1 3 3 5 3 3 5 3 4 3 3 2 5 1 5
## 3 3 4 2 4 2 2 3 4 4 3 2 1 3 2 5
## 4 2 2 3 1 4 2 2 2 3 4 2 2 4 1 4
## 5 1 5 1 5 5 3 3 3 3 5 3 3 5 3 4
## 6 2 4 3 4 3 3 3 4 2 4 2 2 3 3 4
## AGR1 AGR2 AGR3 AGR4 AGR5 AGR6 AGR7 AGR8 AGR9 AGR10 NEU1 NEU2 NEU3 NEU4 NEU5
## 1 2 5 2 4 2 3 2 4 3 4 1 4 4 2 2
## 2 1 4 1 5 1 5 3 4 5 3 2 3 4 1 3
## 3 1 4 1 4 2 4 1 4 4 3 4 4 4 2 2
## 4 2 4 3 4 2 4 2 4 3 4 3 3 3 2 3
## 5 1 5 1 5 1 3 1 5 5 3 1 5 5 3 1
## 6 2 3 1 4 2 3 2 3 4 4 3 4 3 2 2
## NEU6 NEU7 NEU8 NEU9 NEU10
## 1 2 2 2 3 2
## 2 1 2 1 3 1
## 3 2 2 2 1 3
## 4 2 2 2 4 3
## 5 1 1 1 3 2
## 6 1 2 1 2 2
To reduce computing cost and still generate a robust ML model, we will be forming 5 dataframes from the original dataframe and then removing the duplicate responses for a specific factor.
This will eliminate the possibility of overfitting the model.
Big 5 Personality model is used to predict 5 personality traits:
- Openness
- Conscientiousness
- Extraversion
- Agreeableness
- Neuroticism
opn <- data_set[!duplicated(data_set[, 1:10]), 1:10]
csn <- data_set[!duplicated(data_set[, 11:20]), 11:20]
ext <- data_set[!duplicated(data_set[, 21:30]), 21:30]
agr <- data_set[!duplicated(data_set[, 31:40]), 31:40]
neu <- data_set[!duplicated(data_set[, 41:50]), 41:50]
sample_size <- 10000
opn <- opn[seq(sample_size), ]
csn <- csn[seq(sample_size), ]
ext <- ext[seq(sample_size), ]
agr <- agr[seq(sample_size), ]
neu <- neu[seq(sample_size), ]
Implement scoring key
We are going to form 5 different models, one for each factor in order to achieve higher accuracy for each model.
The key evaluation of the questions are scored in the range from 0 to 40. Convert the score out of 100, for a better understanding for the user and the model.
opn$result[opn$result > 40] <- 40
opn$result[opn$result < 0] <- 0
opn$result <- ceiling(opn$result * 2.5)
csn$result[csn$result > 40] <- 40
csn$result[csn$result < 0] <- 0
csn$result <- ceiling(csn$result * 2.5)
ext$result[ext$result > 40] <- 40
ext$result[ext$result < 0] <- 0
ext$result <- ceiling(ext$result * 2.5)
agr$result[agr$result > 40] <- 40
agr$result[agr$result < 0] <- 0
agr$result <- ceiling(agr$result * 2.5)
neu$result[neu$result > 40] <- 40
neu$result[neu$result < 0] <- 0
neu$result <- ceiling(neu$result * 2.5)
opn <- na.omit(opn)
csn <- na.omit(csn)
ext <- na.omit(ext)
agr <- na.omit(agr)
neu <- na.omit(neu)
Prepare the Machine learning Models
Normalize the dataset
In order to make the training process of the model much better, we will convert all the feature columns in the range of -1 to 1.
opn_a <- as.data.frame(lapply(opn[1:10], normalize))
opn_a$result <- opn$result
csn_a <- as.data.frame(lapply(csn[1:10], normalize))
csn_a$result <- csn$result
ext_a <- as.data.frame(lapply(ext[1:10], normalize))
ext_a$result <- ext$result
agr_a <- as.data.frame(lapply(agr[1:10], normalize))
agr_a$result <- agr$result
neu_a <- as.data.frame(lapply(neu[1:10], normalize))
neu_a$result <- neu$result
Split dataset into training dataset and testing dataset
index_o <- createDataPartition(opn_a$result, p=0.75, list=FALSE)
trainset_o <- opn_a[index_o, ]
testset_o <- opn_a[-index_o, ]
index_c <- createDataPartition(csn_a$result, p=0.75, list=FALSE)
trainset_c <- csn_a[index_c, ]
testset_c <- csn_a[-index_c, ]
index_e <- createDataPartition(ext_a$result, p=0.75, list=FALSE)
trainset_e <- ext_a[index_e, ]
testset_e <- ext_a[-index_e, ]
index_a <- createDataPartition(agr_a$result, p=0.75, list=FALSE)
trainset_a <- agr_a[index_a, ]
testset_a <- agr_a[-index_a, ]
index_n <- createDataPartition(neu_a$result, p=0.75, list=FALSE)
trainset_n <- neu_a[index_n, ]
testset_n <- neu_a[-index_n, ]
Train the Linear Regression Models for each factor
lr_model_opn <- lm(result ~ ., data = trainset_o)
lr_model_csn <- lm(result ~ ., data = trainset_c)
lr_model_ext <- lm(result ~ ., data = trainset_e)
lr_model_agr <- lm(result ~ ., data = trainset_a)
lr_model_neu <- lm(result ~ ., data = trainset_n)
Predicting values for the Testset
opn_result_predicted <- predict(lr_model_opn, testset_o[1:10])
csn_result_predicted <- predict(lr_model_csn, testset_c[1:10])
ext_result_predicted <- predict(lr_model_ext, testset_e[1:10])
agr_result_predicted <- predict(lr_model_agr, testset_a[1:10])
neu_result_predicted <- predict(lr_model_neu, testset_n[1:10])
Calculating the Accuracy of the models
opn_modelPerformance <- data.frame(RMSE = RMSE(opn_result_predicted, testset_o$result),
R2 = R2(opn_result_predicted, testset_o$result))
csn_modelPerformance <- data.frame(RMSE = RMSE(csn_result_predicted, testset_c$result),
R2 = R2(csn_result_predicted, testset_c$result))
ext_modelPerformance <- data.frame(RMSE = RMSE(ext_result_predicted, testset_e$result),
R2 = R2(ext_result_predicted, testset_e$result))
agr_modelPerformance <- data.frame(RMSE = RMSE(agr_result_predicted, testset_a$result),
R2 = R2(agr_result_predicted, testset_a$result))
neu_modelPerformance <- data.frame(RMSE = RMSE(neu_result_predicted, testset_n$result),
R2 = R2(neu_result_predicted, testset_n$result))
k <- c("Openess", "Conscintiousness", "Extraversion", "Agreeableness", "Neuroticism")
cbind(Factors = k, rbind(opn_modelPerformance, csn_modelPerformance, ext_modelPerformance, agr_modelPerformance, neu_modelPerformance))
## Factors RMSE R2
## 1 Openess 0.2513535 0.9997130
## 2 Conscintiousness 0.2549866 0.9997806
## 3 Extraversion 0.2515616 0.9998501
## 4 Agreeableness 0.2509401 0.9997628
## 5 Neuroticism 0.2513456 0.9998294
To save these models for future testing/implementation, use the code below
# saveRDS(model_opn, "model_opn.rds")
# saveRDS(model_csn, "model_csn.rds")
# saveRDS(model_ext, "model_ext.rds")
# saveRDS(model_agr, "model_agr.rds")
# saveRDS(model_neu, "model_neu.rds")
You can also test these models from yourself as well. Below is the list of questions on which these models are based on.
Answer all the questions in range of 1 to 5 and if don’t wish to answer just add 0 for that question.
1 I have a rich vocabulary.
2 I have difficulty understanding abstract ideas.
3 I have a vivid imagination.
4 I am not interested in abstract ideas.
5 I have excellent ideas.
6 I do not have a good imagination.
7 I am quick to understand things.
8 I use difficult words.
9 I spend time reflecting on things.
10 I am full of ideas.
11 I am always prepared.
12 I leave my belongings around.
13 I pay attention to details.
14 I make a mess of things.
15 I get chores done right away.
16 I often forget to put things back in their proper place.
17 I like order.
18 I shirk my duties.
19 I follow a schedule.
20 I am exacting in my work.
21 I am the life of the party.
22 I don’t talk a lot.
23 I feel comfortable around people.
24 I keep in the background.
25 I start conversations.
26 I have little to say.
27 I talk to a lot of different people at parties.
28 I don’t like to draw attention to myself.
29 I don’t mind being the center of attention.
30 I am quiet around strangers.
31 I feel little concern for others.
32 I am interested in people.
33 I insult people.
34 I sympathize with others’ feelings.
35 I am not interested in other people’s problems.
36 I have a soft heart.
37 I am not really interested in others.
38 I take time out for others.
39 I feel others’ emotions.
40 I make people feel at ease.
41 I get stressed out easily.
42 I am relaxed most of the time.
43 I worry about things.
44 I seldom feel blue.
45 I am easily disturbed.
46 I get upset easily.
47 I change my mood a lot.
48 I have frequent mood swings.
49 I get irritated easily.
50 I often feel blue.
Form a vector of 50 values and then implement the code below. For test purpose we have generated 50 random numbers in range of 0 to 5.
k <- sample(x = c(0, 1, 2, 3, 4, 5), size = 50, replace = TRUE)
# Skip the first step and form vector of 50 values
# and name it "k" for perfect evaluation.
k <- rbind(k)
k <- as.data.frame(k)
colnames(k) <- c("OPN1", "OPN2", "OPN3", "OPN4", "OPN5", "OPN6", "OPN7", "OPN8", "OPN9", "OPN10",
"CSN1", "CSN2", "CSN3", "CSN4", "CSN5", "CSN6", "CSN7", "CSN8", "CSN9", "CSN10",
"EXT1", "EXT2", "EXT3", "EXT4", "EXT5", "EXT6", "EXT7", "EXT8", "EXT9", "EXT10",
"AGR1", "AGR2", "AGR3", "AGR4", "AGR5", "AGR6", "AGR7", "AGR8", "AGR9", "AGR10",
"NEU1", "NEU2", "NEU3", "NEU4", "NEU5", "NEU6", "NEU7", "NEU8", "NEU9", "NEU10")
rownames(k) <- 1
s <- k
k <- as.data.frame(lapply(k, normalize))
k <- k[1, ]
opn_result_predicted <- predict(lr_model_opn, k[1:10])
csn_result_predicted <- predict(lr_model_csn, k[11:20])
ext_result_predicted <- predict(lr_model_ext, k[21:30])
agr_result_predicted <- predict(lr_model_agr, k[31:40])
neu_result_predicted <- predict(lr_model_neu, k[41:50])
RMSE <- function (pred, obs, na.rm = FALSE)
sqrt(mean((pred - obs)^2, na.rm = na.rm))
o <- opn_key(s[1:10])
c <- csn_key(s[11:20])
e <- ext_key(s[21:30])
a <- agr_key(s[31:40])
n <- neu_key(s[41:50])
opn_modelPerformance <- data.frame(rmse = RMSE(opn_result_predicted, o))
csn_modelPerformance <- data.frame(rmse = RMSE(csn_result_predicted, c))
ext_modelPerformance <- data.frame(rmse = RMSE(ext_result_predicted, e))
agr_modelPerformance <- data.frame(rmse = RMSE(agr_result_predicted, a))
neu_modelPerformance <- data.frame(rmse = RMSE(neu_result_predicted, n))
k <- c("Openess", "Conscintiousness", "Extraversion", "Agreeableness", "Neuroticism")
cbind(Factors = k,
rbind(opn_modelPerformance, csn_modelPerformance, ext_modelPerformance, agr_modelPerformance, neu_modelPerformance),
Predicted = c(opn_result_predicted, csn_result_predicted, ext_result_predicted, agr_result_predicted, neu_result_predicted),
Calculated = c(opn_key(s[1:10]), csn_key(s[11:20]), ext_key(s[21:30]), agr_key(s[31:40]), neu_key(s[41:50])))
## Factors rmse Predicted Calculated
## 1 Openess 0.3666164 32.86662 32.5
## 2 Conscintiousness 0.1015099 52.60151 52.5
## 3 Extraversion 0.3107114 55.31071 55.0
## 4 Agreeableness 0.3133241 12.81332 12.5
## 5 Neuroticism 0.2431643 50.24316 50.0
Conclusion
Machine Learning could also be implemented to analyse anyone’s personality, just the training is required to do so. And the capabilities of a machine are best shown when it comes to play with numbers, and it is much more preferable over a generalized key for any personality test.