Introduction

The Big Five personality traits, also known as the five-factor model (FFM) and the OCEAN model, is a taxonomy, or grouping, for personality traits. When factor analysis (a statistical technique) is applied to personality survey data, some words used to describe aspects of personality are often applied to the same person. For example, someone described as conscientious is more likely to be described as “always prepared” rather than “messy”. This theory is based therefore on the association between words but not on neuropsychological experiments. This theory uses descriptors of common language and therefore suggests five broad dimensions commonly used to describe the human personality and psyche.

This data used for analysis was collected (2016-2018) through an interactive online personality test.

The personality test was constructed with the “Big-Five Factor Traits” from the IPIP. This test is evaluate using the LIKERT SCALE.

https://openpsychometrics.org/tests/IPIP-BFFM/

Aim to achieve

Our motive to conduct this analysis is to implement the psycological analysis using the ML algorithms instead of traditional scoring system used by psychiatrists.

We are going to build a Linear Regression model and evaluate the dataset to predict the factor values and eliminate the discreteness present in the evaluation.

Source of dataset: IPIP DataSet website, Download, Kaggle

Import the important libraries and some helpful functions.

library(caret)
library(RCurl)
library(FNN)
library(class)
library(gmodels)
library(splitstackshape)
library(tidyverse)
library(randomForest)

normalize <-function(x){
    x <- x/5
    x <- format(x, digits = 2, nsmall = 1)
    return (x)
}

opn_key <- function(x){
    if(is.data.frame(x)){
        op <- x[, c(1, 3, 5, 7, 8, 9, 10)]
        on <- x[, c(2, 4, 6)]
        return((sum(op) - sum(on) + 8)*2.5)
    }
    op <- x[c(1, 3, 5, 7, 8, 9, 10)]
    on <- x[c(2, 4, 6)]
    return((sum(op) - sum(on) + 8)*2.5)
}

csn_key <- function(x){
    if(is.data.frame(x)){
        cp <- x[, c(1, 3, 5, 7, 9, 10)]
        cn <- x[, c(2, 4, 6, 8)]
        return((sum(cp) - sum(cn) + 14)*2.5)
    }
    cp <- x[c(1, 3, 5, 7, 9, 10)]
    cn <- x[c(2, 4, 6, 8)]
    return((sum(cp) - sum(cn) + 14)*2.5)
}

ext_key <- function(x){
    if(is.data.frame(x)){
        ep <- x[, c(1, 3, 5, 7, 9)]
        en <- x[, c(2, 4, 6, 8, 10)]
        return((sum(ep) - sum(en) + 20)*2.5)
    }
    ep <- x[c(1, 3, 5, 7, 9)]
    en <- x[c(2, 4, 6, 8, 10)]
    return((sum(ep) - sum(en) + 20)*2.5)
}

agr_key <- function(x){
    if(is.data.frame(x)){
        ap <- x[, c(2, 4, 6, 8, 9, 10)]
        an <- x[, c(1, 3, 5, 7)]
        return((sum(ap) - sum(an) + 14)*2.5)
    }
    ap <- x[c(2, 4, 6, 8, 9, 10)]
    an <- x[c(1, 3, 5, 7)]
    return((sum(ap) - sum(an) + 14)*2.5)
}

neu_key <- function(x){
    if(is.data.frame(x)){
        np <- x[, c(2, 4)]
        nn <- x[, c(1, 3, 5, 6, 7, 8, 9, 10)]
        return((sum(np) - sum(nn) + 38)*2.5)
    }
    np <- x[c(2, 4)]
    nn <- x[c(1, 3, 5, 6, 7, 8, 9, 10)]
    return((sum(np) - sum(nn) + 38)*2.5)
}

Read dataset

data_set <- read.csv("dataset.csv",  
                     header = TRUE, 
                     nrows = 10000)
# If you are using the Kaggle dataset, then run the code below.
# data_set <- read.csv("final_data.txt", sep = "/t", header = TRUE, nrow = 10000)
# data_set <- data_set[, 50]

Preview of Dataset

As you can observe that our dataset have 50 columns. Each columns represents the responses by users for a specific question. There are 5 different column names that specify which factor the respective question belong to.
head(data_set)
##   OPN1 OPN2 OPN3 OPN4 OPN5 OPN6 OPN7 OPN8 OPN9 OPN10 CSN1 CSN2 CSN3 CSN4 CSN5
## 1    5    1    4    1    4    1    5    3    4     5    3    4    3    2    2
## 2    1    2    4    2    3    1    4    2    5     3    3    2    5    3    3
## 3    5    1    2    1    4    2    5    3    4     4    4    2    2    2    3
## 4    4    2    5    2    3    1    4    4    3     3    2    4    4    4    1
## 5    5    1    5    1    5    1    5    3    5     5    5    1    5    1    3
## 6    5    1    5    1    3    1    5    4    5     2    3    2    4    1    3
##   CSN6 CSN7 CSN8 CSN9 CSN10 EXT1 EXT2 EXT3 EXT4 EXT5 EXT6 EXT7 EXT8 EXT9 EXT10
## 1    4    4    2    4     4    4    1    5    2    5    1    5    2    4     1
## 2    1    3    3    5     3    3    5    3    4    3    3    2    5    1     5
## 3    3    4    2    4     2    2    3    4    4    3    2    1    3    2     5
## 4    2    2    3    1     4    2    2    2    3    4    2    2    4    1     4
## 5    1    5    1    5     5    3    3    3    3    5    3    3    5    3     4
## 6    2    4    3    4     3    3    3    4    2    4    2    2    3    3     4
##   AGR1 AGR2 AGR3 AGR4 AGR5 AGR6 AGR7 AGR8 AGR9 AGR10 NEU1 NEU2 NEU3 NEU4 NEU5
## 1    2    5    2    4    2    3    2    4    3     4    1    4    4    2    2
## 2    1    4    1    5    1    5    3    4    5     3    2    3    4    1    3
## 3    1    4    1    4    2    4    1    4    4     3    4    4    4    2    2
## 4    2    4    3    4    2    4    2    4    3     4    3    3    3    2    3
## 5    1    5    1    5    1    3    1    5    5     3    1    5    5    3    1
## 6    2    3    1    4    2    3    2    3    4     4    3    4    3    2    2
##   NEU6 NEU7 NEU8 NEU9 NEU10
## 1    2    2    2    3     2
## 2    1    2    1    3     1
## 3    2    2    2    1     3
## 4    2    2    2    4     3
## 5    1    1    1    3     2
## 6    1    2    1    2     2

Data Preprocessing

Remove NaN, NA and NULL from the dataset

i <- seq(1, 50)
data_set[, i] <- apply(data_set[, i], 2, function(x) as.numeric(as.character(x)))
data_set <- na.omit(data_set)
head(data_set)
##   OPN1 OPN2 OPN3 OPN4 OPN5 OPN6 OPN7 OPN8 OPN9 OPN10 CSN1 CSN2 CSN3 CSN4 CSN5
## 1    5    1    4    1    4    1    5    3    4     5    3    4    3    2    2
## 2    1    2    4    2    3    1    4    2    5     3    3    2    5    3    3
## 3    5    1    2    1    4    2    5    3    4     4    4    2    2    2    3
## 4    4    2    5    2    3    1    4    4    3     3    2    4    4    4    1
## 5    5    1    5    1    5    1    5    3    5     5    5    1    5    1    3
## 6    5    1    5    1    3    1    5    4    5     2    3    2    4    1    3
##   CSN6 CSN7 CSN8 CSN9 CSN10 EXT1 EXT2 EXT3 EXT4 EXT5 EXT6 EXT7 EXT8 EXT9 EXT10
## 1    4    4    2    4     4    4    1    5    2    5    1    5    2    4     1
## 2    1    3    3    5     3    3    5    3    4    3    3    2    5    1     5
## 3    3    4    2    4     2    2    3    4    4    3    2    1    3    2     5
## 4    2    2    3    1     4    2    2    2    3    4    2    2    4    1     4
## 5    1    5    1    5     5    3    3    3    3    5    3    3    5    3     4
## 6    2    4    3    4     3    3    3    4    2    4    2    2    3    3     4
##   AGR1 AGR2 AGR3 AGR4 AGR5 AGR6 AGR7 AGR8 AGR9 AGR10 NEU1 NEU2 NEU3 NEU4 NEU5
## 1    2    5    2    4    2    3    2    4    3     4    1    4    4    2    2
## 2    1    4    1    5    1    5    3    4    5     3    2    3    4    1    3
## 3    1    4    1    4    2    4    1    4    4     3    4    4    4    2    2
## 4    2    4    3    4    2    4    2    4    3     4    3    3    3    2    3
## 5    1    5    1    5    1    3    1    5    5     3    1    5    5    3    1
## 6    2    3    1    4    2    3    2    3    4     4    3    4    3    2    2
##   NEU6 NEU7 NEU8 NEU9 NEU10
## 1    2    2    2    3     2
## 2    1    2    1    3     1
## 3    2    2    2    1     3
## 4    2    2    2    4     3
## 5    1    1    1    3     2
## 6    1    2    1    2     2

To reduce computing cost and still generate a robust ML model, we will be forming 5 dataframes from the original dataframe and then removing the duplicate responses for a specific factor.

This will eliminate the possibility of overfitting the model.

Big 5 Personality model is used to predict 5 personality traits:
  • Openness
  • Conscientiousness
  • Extraversion
  • Agreeableness
  • Neuroticism
opn <- data_set[!duplicated(data_set[, 1:10]), 1:10]
csn <- data_set[!duplicated(data_set[, 11:20]), 11:20]
ext <- data_set[!duplicated(data_set[, 21:30]), 21:30]
agr <- data_set[!duplicated(data_set[, 31:40]), 31:40]
neu <- data_set[!duplicated(data_set[, 41:50]), 41:50]
sample_size <- 10000
opn <- opn[seq(sample_size), ]
csn <- csn[seq(sample_size), ]
ext <- ext[seq(sample_size), ]
agr <- agr[seq(sample_size), ]
neu <- neu[seq(sample_size), ]

Form Positive and Negative dataframes for key evaluation

Each question in the test is either considered +ve valued or -ve valued, and we are going to group all the +ve and -ve questions for each factor in order to evaluate them seperately.
opn_p <- opn[, c(1, 3, 5, 7, 8, 9, 10)]
opn_n <- opn[, c(2, 4, 6)]
csn_p <- csn[, c(1, 3, 5, 7, 9, 10)]
csn_n <- csn[, c(2, 4, 6, 8)]
ext_p <- ext[, c(1, 3, 5, 7, 9)]
ext_n <- ext[, c(2, 4, 6, 8, 10)]
agr_p <- agr[, c(2, 4, 6, 8, 9, 10)]
agr_n <- agr[, c(1, 3, 5, 7)]
neu_p <- neu[, c(2, 4)]
neu_n <- neu[, c(1, 3, 5, 6, 7, 8, 9, 10)]

Implement scoring key

We are going to form 5 different models, one for each factor in order to achieve higher accuracy for each model.
But first we need to form the TARGET columns for each column by evaluating according to the scoring key of the model.
opn$result <- 0
csn$result <- 0
ext$result <- 0
agr$result <- 0
neu$result <- 0
  • Openess
for (i in seq(1:nrow(opn))){
    #Openess
    k <- sum(opn_p[i, ])
    s <- sum(opn_n[i, ])
    opn[i, "result"] = 8 + k - s
}
  • Conscientiousness
for (i in seq(1:nrow(csn))){
    #Conscientiousness
    k <- sum(csn_p[i, ])
    s <- sum(csn_n[i, ])
    csn[i, "result"] = 14 + k - s
}
  • Extraversion
for (i in seq(1:nrow(ext))){
    #Extroversion
    k <- sum(ext_p[i, ])
    s <- sum(ext_n[i, ])
    ext[i, "result"] = 20 + k - s
}
  • Agreeableness
for (i in seq(1:nrow(agr))){
    #Agreeableness
    k <- sum(agr_p[i, ])
    s <- sum(agr_n[i, ])
    agr[i, "result"] = 14 + k - s
}
  • Neuroticism
for (i in seq(1:nrow(neu))){
    #Neuroticism
    k <- sum(neu_p[i, ])
    s <- sum(neu_n[i, ])
    neu[i, "result"] = 38 + k - s
}

The key evaluation of the questions are scored in the range from 0 to 40. Convert the score out of 100, for a better understanding for the user and the model.

opn$result[opn$result > 40] <- 40
opn$result[opn$result < 0] <- 0
opn$result <- ceiling(opn$result * 2.5)

csn$result[csn$result > 40] <- 40
csn$result[csn$result < 0] <- 0
csn$result <- ceiling(csn$result * 2.5)

ext$result[ext$result > 40] <- 40
ext$result[ext$result < 0] <- 0
ext$result <- ceiling(ext$result * 2.5)

agr$result[agr$result > 40] <- 40
agr$result[agr$result < 0] <- 0
agr$result <- ceiling(agr$result * 2.5)

neu$result[neu$result > 40] <- 40
neu$result[neu$result < 0] <- 0
neu$result <- ceiling(neu$result * 2.5)

opn <- na.omit(opn)
csn <- na.omit(csn)
ext <- na.omit(ext)
agr <- na.omit(agr)
neu <- na.omit(neu)

Prepare the Machine learning Models

Normalize the dataset

In order to make the training process of the model much better, we will convert all the feature columns in the range of -1 to 1.
opn_a <- as.data.frame(lapply(opn[1:10], normalize))
opn_a$result <- opn$result
csn_a <- as.data.frame(lapply(csn[1:10], normalize))
csn_a$result <- csn$result
ext_a <- as.data.frame(lapply(ext[1:10], normalize))
ext_a$result <- ext$result
agr_a <- as.data.frame(lapply(agr[1:10], normalize))
agr_a$result <- agr$result
neu_a <- as.data.frame(lapply(neu[1:10], normalize))
neu_a$result <- neu$result

Split dataset into training dataset and testing dataset

index_o <- createDataPartition(opn_a$result, p=0.75, list=FALSE)
trainset_o <- opn_a[index_o, ]
testset_o <- opn_a[-index_o, ]

index_c <- createDataPartition(csn_a$result, p=0.75, list=FALSE)
trainset_c <- csn_a[index_c, ]
testset_c <- csn_a[-index_c, ]

index_e <- createDataPartition(ext_a$result, p=0.75, list=FALSE)
trainset_e <- ext_a[index_e, ]
testset_e <- ext_a[-index_e, ]

index_a <- createDataPartition(agr_a$result, p=0.75, list=FALSE)
trainset_a <- agr_a[index_a, ]
testset_a <- agr_a[-index_a, ]

index_n <- createDataPartition(neu_a$result, p=0.75, list=FALSE)
trainset_n <- neu_a[index_n, ]
testset_n <- neu_a[-index_n, ]

Train the Linear Regression Models for each factor

lr_model_opn <- lm(result ~ ., data = trainset_o)
lr_model_csn <- lm(result ~ ., data = trainset_c)
lr_model_ext <- lm(result ~ ., data = trainset_e)
lr_model_agr <- lm(result ~ ., data = trainset_a)
lr_model_neu <- lm(result ~ ., data = trainset_n)

Predicting values for the Testset

opn_result_predicted <- predict(lr_model_opn, testset_o[1:10])
csn_result_predicted <- predict(lr_model_csn, testset_c[1:10])
ext_result_predicted <- predict(lr_model_ext, testset_e[1:10])
agr_result_predicted <- predict(lr_model_agr, testset_a[1:10])
neu_result_predicted <- predict(lr_model_neu, testset_n[1:10])

Calculating the Accuracy of the models

opn_modelPerformance <- data.frame(RMSE = RMSE(opn_result_predicted, testset_o$result), 
                                   R2 = R2(opn_result_predicted, testset_o$result))
csn_modelPerformance <- data.frame(RMSE = RMSE(csn_result_predicted, testset_c$result), 
                                   R2 = R2(csn_result_predicted, testset_c$result))
ext_modelPerformance <- data.frame(RMSE = RMSE(ext_result_predicted, testset_e$result), 
                                   R2 = R2(ext_result_predicted, testset_e$result))
agr_modelPerformance <- data.frame(RMSE = RMSE(agr_result_predicted, testset_a$result), 
                                   R2 = R2(agr_result_predicted, testset_a$result))
neu_modelPerformance <- data.frame(RMSE = RMSE(neu_result_predicted, testset_n$result), 
                                   R2 = R2(neu_result_predicted, testset_n$result))
k <- c("Openess", "Conscintiousness", "Extraversion", "Agreeableness", "Neuroticism")
cbind(Factors = k, rbind(opn_modelPerformance, csn_modelPerformance, ext_modelPerformance, agr_modelPerformance, neu_modelPerformance))
##            Factors      RMSE        R2
## 1          Openess 0.2513535 0.9997130
## 2 Conscintiousness 0.2549866 0.9997806
## 3     Extraversion 0.2515616 0.9998501
## 4    Agreeableness 0.2509401 0.9997628
## 5      Neuroticism 0.2513456 0.9998294

To save these models for future testing/implementation, use the code below

# saveRDS(model_opn, "model_opn.rds")
# saveRDS(model_csn, "model_csn.rds")
# saveRDS(model_ext, "model_ext.rds")
# saveRDS(model_agr, "model_agr.rds")
# saveRDS(model_neu, "model_neu.rds")

Test the Models at random/custom inputs

You can also test these models from yourself as well. Below is the list of questions on which these models are based on.
Answer all the questions in range of 1 to 5 and if don’t wish to answer just add 0 for that question.
1 I have a rich vocabulary.
2 I have difficulty understanding abstract ideas.
3 I have a vivid imagination.
4 I am not interested in abstract ideas.
5 I have excellent ideas.
6 I do not have a good imagination.
7 I am quick to understand things.
8 I use difficult words.
9 I spend time reflecting on things.
10 I am full of ideas.
11 I am always prepared.
12 I leave my belongings around.
13 I pay attention to details.
14 I make a mess of things.
15 I get chores done right away.
16 I often forget to put things back in their proper place.
17 I like order.
18 I shirk my duties.
19 I follow a schedule.
20 I am exacting in my work.
21 I am the life of the party.
22 I don’t talk a lot.
23 I feel comfortable around people.
24 I keep in the background.
25 I start conversations.
26 I have little to say.
27 I talk to a lot of different people at parties.
28 I don’t like to draw attention to myself.
29 I don’t mind being the center of attention.
30 I am quiet around strangers.
31 I feel little concern for others.
32 I am interested in people.
33 I insult people.
34 I sympathize with others’ feelings.
35 I am not interested in other people’s problems.
36 I have a soft heart.
37 I am not really interested in others.
38 I take time out for others.
39 I feel others’ emotions.
40 I make people feel at ease.
41 I get stressed out easily.
42 I am relaxed most of the time.
43 I worry about things.
44 I seldom feel blue.
45 I am easily disturbed.
46 I get upset easily.
47 I change my mood a lot.
48 I have frequent mood swings.
49 I get irritated easily.
50 I often feel blue.
Form a vector of 50 values and then implement the code below. For test purpose we have generated 50 random numbers in range of 0 to 5.
k <- sample(x = c(0, 1, 2, 3, 4, 5), size = 50, replace = TRUE)
# Skip the first step and form vector of 50 values
# and name it "k" for perfect evaluation.
k <- rbind(k)
k <- as.data.frame(k)
colnames(k) <- c("OPN1", "OPN2", "OPN3", "OPN4", "OPN5", "OPN6", "OPN7", "OPN8", "OPN9", "OPN10", 
                 "CSN1", "CSN2", "CSN3", "CSN4", "CSN5", "CSN6", "CSN7", "CSN8", "CSN9", "CSN10", 
                 "EXT1", "EXT2", "EXT3", "EXT4", "EXT5", "EXT6", "EXT7", "EXT8", "EXT9", "EXT10", 
                 "AGR1", "AGR2", "AGR3", "AGR4", "AGR5", "AGR6", "AGR7", "AGR8", "AGR9", "AGR10", 
                 "NEU1", "NEU2", "NEU3", "NEU4", "NEU5", "NEU6", "NEU7", "NEU8", "NEU9", "NEU10")
rownames(k) <- 1
s <- k
k <- as.data.frame(lapply(k, normalize))
k <- k[1, ]
opn_result_predicted <- predict(lr_model_opn, k[1:10])
csn_result_predicted <- predict(lr_model_csn, k[11:20])
ext_result_predicted <- predict(lr_model_ext, k[21:30])
agr_result_predicted <- predict(lr_model_agr, k[31:40])
neu_result_predicted <- predict(lr_model_neu, k[41:50])
RMSE <- function (pred, obs, na.rm = FALSE)
sqrt(mean((pred - obs)^2, na.rm = na.rm))

o <- opn_key(s[1:10])
c <- csn_key(s[11:20])
e <- ext_key(s[21:30])
a <- agr_key(s[31:40])
n <- neu_key(s[41:50])

opn_modelPerformance <- data.frame(rmse = RMSE(opn_result_predicted, o))
csn_modelPerformance <- data.frame(rmse = RMSE(csn_result_predicted, c))
ext_modelPerformance <- data.frame(rmse = RMSE(ext_result_predicted, e))
agr_modelPerformance <- data.frame(rmse = RMSE(agr_result_predicted, a))
neu_modelPerformance <- data.frame(rmse = RMSE(neu_result_predicted, n))

k <- c("Openess", "Conscintiousness", "Extraversion", "Agreeableness", "Neuroticism")
cbind(Factors = k, 
      rbind(opn_modelPerformance, csn_modelPerformance, ext_modelPerformance, agr_modelPerformance, neu_modelPerformance), 
      Predicted = c(opn_result_predicted, csn_result_predicted, ext_result_predicted, agr_result_predicted, neu_result_predicted), 
      Calculated = c(opn_key(s[1:10]), csn_key(s[11:20]), ext_key(s[21:30]), agr_key(s[31:40]), neu_key(s[41:50])))
##            Factors      rmse Predicted Calculated
## 1          Openess 0.3666164  32.86662       32.5
## 2 Conscintiousness 0.1015099  52.60151       52.5
## 3     Extraversion 0.3107114  55.31071       55.0
## 4    Agreeableness 0.3133241  12.81332       12.5
## 5      Neuroticism 0.2431643  50.24316       50.0

Conclusion

Machine Learning could also be implemented to analyse anyone’s personality, just the training is required to do so. And the capabilities of a machine are best shown when it comes to play with numbers, and it is much more preferable over a generalized key for any personality test.