Big Five Personality Project

Introduction

Our group analyzed data from the BIG5 library which consisted of responses from 19,719 participants who took the Big Five Personality Inventory, a self-report questionnaire that measures the Big Five personality traits: openness, conscientiousness, extraversion, agreeableness, and neuroticism. The data we viewed categorized other information about the participants such as race, age, gender, native language, hand dominance, and even format of test taken. We had the initial idea to investigate variances of these personality traits across different ethnicities.

Data Cleaning

To begin, we loaded and viewed our data set.

data <- read_tsv("data.csv")

head(data[1:22]) %>%
  kbl() %>%
  kable_styling(c("striped", "hover"))

race	age	engnat	gender	hand	source	country	E1	E2	E3	E4	E5	E6	E7	E8	E9	E10	N1	N2	N3	N4	N5
3	53	1	1	1	1	US	4	2	5	2	5	1	4	3	5	1	1	5	2	5	1
13	46	1	2	1	1	US	2	2	3	3	3	3	1	5	1	5	2	3	4	2	3
1	14	2	2	1	1	PK	5	1	1	4	5	1	1	5	5	1	5	1	5	5	5
3	19	2	2	1	1	RO	2	5	2	4	3	4	3	4	4	5	5	4	4	2	4
11	25	2	2	1	2	US	3	1	3	3	3	1	3	1	3	5	3	3	3	4	3
13	31	1	2	1	2	US	1	5	2	4	1	3	2	4	1	5	1	5	4	5	1

head(data[23:44]) %>%
  kbl() %>%
  kable_styling(c("striped", "hover"))

N6	N7	N8	N9	N10	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	C1	C2	C3	C4	C5	C6	C7
1	1	1	1	1	1	5	1	5	2	3	1	5	4	5	4	1	5	1	5	1	4
4	3	2	2	4	1	3	3	4	4	4	2	3	4	3	4	1	3	2	3	1	5
5	5	5	5	5	5	1	5	5	1	5	1	5	5	5	4	1	5	1	5	1	5
5	5	5	4	5	2	5	4	4	3	5	3	4	4	3	3	3	4	5	1	4	5
3	3	3	3	4	5	5	3	5	1	5	1	5	5	5	3	1	5	3	3	1	1
4	4	1	5	2	2	2	3	4	3	4	3	5	5	3	2	5	4	3	3	4	5

head(data[45:57]) %>%
  kbl() %>%
  kable_styling(c("striped", "hover"))

C8	C9	C10	O1	O2	O3	O4	O5	O6	O7	O8	O9	O10
1	4	5	4	1	3	1	5	1	4	2	5	5
1	4	4	3	3	3	3	2	3	3	1	3	2
1	5	5	4	5	5	1	5	1	5	5	5	5
4	2	3	4	3	5	2	4	2	5	2	5	5
3	3	3	3	1	1	1	3	1	3	1	5	3
3	5	3	4	2	1	3	3	5	5	4	5	3

Noticing that the dataset contained a column per question of the survey, we aimed to condense these columns down to five primary score columns—one per personality trait assessed by the Big Five. This first required converting any question items that were reverse-keyed so that they would have a normally-keyed score.

to_positive <- function(score) {
  positive_score = -score %% 6
  return(positive_score)
}

Here we identify reverse-keyed items and store them in a vector:

negative_questions <- c("E2", "E4", "E6", "E8", "E10", "N1", "N3", "N5", "N6", "N7", "N8", "N9", "N10", "A1", "A3", "A5", "A7", "C2", "C4", "C6", "C8", "O2", "O4", "O6")

In order to avoid problems related to missing values (because this dataset presents missing values as “0”), we replaced all 0s with NAs.

data <- na_if(data, 0)

Next we converted negative scores to positive scores and replaced the old columns in the data with the updated columns.

for (question in negative_questions)
{
  data[question] <- to_positive(data[question])
}

We then summed the corresponding columns and made new total score columns.

data %>%
  transmute(Extraversion = rowSums(data[, c(8:17)]),
            Neuroticism = rowSums(data[, c(18:27)]),
            Agreeableness = rowSums(data[, c(28:37)]),
            Conscientiousness = rowSums(data[, c(38:47)]),
            Openness = rowSums(data[, c(48:57)])) -> totals

Next, we combined the new total score columns with the original demographic data included in the dataset.

data <- cbind(data[, c(1:7)], totals)

Here is our updated data set with the total score columns instead of individual values for each question item:

head(data) %>%
  kbl() %>%
  kable_styling(c("striped", "hover"))

race	age	engnat	gender	hand	source	country	Extraversion	Neuroticism	Agreeableness	Conscientiousness	Openness
3	53	1	1	1	1	US	44	49	46	47	43
13	46	1	2	1	1	US	22	29	35	42	26
1	14	2	2	1	1	PK	35	14	38	49	45
3	19	2	2	1	1	RO	22	17	37	26	41
11	25	2	2	1	2	US	34	30	44	34	34
13	31	1	2	1	2	US	16	36	36	31	33

We then realized that the data for race was defaulting to continuous data instead of discrete data, which was causing problems with our analysis. Therefore, we decided to recode the data as discrete variables by converting the numbers assigned to each race value to character strings.

race_converter <- function(race_code) {
  if (race_code == 1) {
    return("Mixed")
  }
  else if (race_code == 2) {
    return("Arctic")
  }
  else if (race_code == 3) {
    return("European Caucasian")
  }
  else if (race_code == 4) {
    return("Indian Caucasian")
  }
  else if (race_code == 5) {
    return("Middle Eastern Caucasian")
  }
  else if (race_code == 6) {
    return("North African/Other Caucasian")
  }
  else if (race_code == 7) {
    return("Indigenous Australian")
  }
  else if (race_code == 8) {
    return("Native American")
  }
  else if (race_code == 9) {
    return("North East Asian")
  }
  else if (race_code == 10) {
    return("Pacific Islander")
  }
  else if (race_code == 11) {
    return("South East Asian")
  }
  else if (race_code == 12) {
    return("West African, Bushmen, Ethiopian")
  }
  else if (race_code == 13) {
    return("Other")
  }
  else {
    return(NA)
  }
}

Next we removed any rows from the data which had an NA for race.

no_nas_race <- drop_na(data, race)

We then converted the old, continuous race data to the new, discrete race data.

no_nas_race["race"] <- apply(no_nas_race["race"], 1, race_converter)

We realized that the data for gender was also defaulting to continuous data instead of discrete data, which was causing problems with our analysis as well. Therefore, we decided to recode this data as discrete variables by converting the numbers assigned to each race value to character strings in the same way as we recoded the race data.

  gender_converter <- function(gender_code) {
    if (gender_code == 1) {
      return("Male")
    }
    else if (gender_code == 2) {
      return("Female")
    }
    else if (gender_code == 3) {
      return("Other")
    }
    else {
      return(NA)
    }
  }

Next we removed any rows from the data which had an NA for gender.

no_nas_gender <- drop_na(data, gender)

We then converted the old, continuous gender data to the new, discrete gender data.

no_nas_gender["gender"] <- apply(no_nas_gender["gender"], 1, gender_converter)

We also removed any rows which contained an NA for age.

no_nas_age <- drop_na(data, age)

We then limited the age range to less than 90 to remove any false information:

no_nas_age <- filter(no_nas_age, age < 90)

We didn’t want any NA’s to show up on our final table, so we dropped any that may have been present in the data.

data <- drop_na(data, gender, race)
data["gender"] <- apply(data["gender"], 1, gender_converter)
data["race"] <- apply(data["race"], 1, race_converter)
data <- subset(data, select = -c(engnat, hand, source))

At this point we had finished cleaning our data, so we looked at our cleaned and updated data frame:

head(data) %>%
  kbl() %>%
  kable_styling(c("striped", "hover"))

race	age	gender	country	Extraversion	Neuroticism	Agreeableness	Conscientiousness	Openness
European Caucasian	53	Male	US	44	49	46	47	43
Other	46	Female	US	22	29	35	42	26
Mixed	14	Female	PK	35	14	38	49	45
European Caucasian	19	Female	RO	22	17	37	26	41
South East Asian	25	Female	US	34	30	44	34	34
Other	31	Female	US	16	36	36	31	33

Data Visualization and Analysis

To visualize the data, we set up subplots and arranged them into a single figure.

NOTE: Because this dataset was collected cross-sectionally and not longitudinally, the trends displayed in these graphs are those of Big Five trait scores of different people of different age groups. Therefore, these graphs do not show average change in Big Five personality traits over people’s lifetimes bur rather show averages of Big Five personality traits in people of different ages at the same time.
NOTE: We wanted to change the axes so that their units were all the same so that the graphs would be clearer and easier to interpret, but we couldn’t figure out how.

plot1 <- ggplot(no_nas_age) + geom_smooth(mapping = aes(x = age, y = Extraversion))
plot2 <- ggplot(no_nas_age) + geom_smooth(mapping = aes(x = age, y = Openness))
plot3 <- ggplot(no_nas_age) + geom_smooth(mapping = aes(x = age, y = Conscientiousness))
plot4 <- ggplot(no_nas_age) + geom_smooth(mapping = aes(x = age, y = Agreeableness))
plot5 <- ggplot(no_nas_age) + geom_smooth(mapping = aes(x = age, y = Neuroticism))

ggarrange(plot1, plot2, plot3, plot4, plot5, labels = c("Extraversion", "Openness", "Conscientiousness", "Agreeableness", "Neuroticism"), ncol=1, nrow=5)

We then set up a correlation matrix to examine the correlations between ALL of the different possible correlations between all of the Big Five personality traits (i.e., correlation between Extraversion and Neuroticism, correlation between Extraversion and Agreeableness, etc.).

source("http://www.sthda.com/upload/rquery_cormat.r")
cormat <- rquery.cormat(totals, type="full")

Seeing possible statistically significant correlations, we viewed a table of the specific correlation values.

cormat$r %>%
  kbl(digits = 2) %>%
  kable_styling(c("striped", "hover"))

	Neuroticism	Conscientiousness	Openness	Extraversion	Agreeableness
Neuroticism	1.00	0.26	0.10	0.26	0.11
Conscientiousness	0.26	1.00	0.09	0.11	0.18
Openness	0.10	0.09	1.00	0.17	0.12
Extraversion	0.26	0.11	0.17	1.00	0.33
Agreeableness	0.11	0.18	0.12	0.33	1.00

We also viewed the p-values for the correlations to assess statistical significance.

cormat$p %>%
  kbl(digits = 6000) %>%
  kable_styling(c("striped", "hover"))

	Neuroticism	Conscientiousness	Openness	Extraversion	Agreeableness
Neuroticism	0.0e+00	1.7e-301	8.7e-46	5.4e-308	4.8e-56
Conscientiousness	1.7e-301	0.0e+00	1.0e-33	1.5e-50	1.1e-136
Openness	8.7e-46	1.0e-33	0.0e+00	5.4e-122	3.5e-60
Extraversion	5.4e-308	1.5e-50	5.4e-122	0.0e+00	0.0e+00
Agreeableness	4.8e-56	1.1e-136	3.5e-60	0.0e+00	0.0e+00

Conclusion

Based on our analyses, we found that all of the big five traits tended to decline around the age of 60, with the exception of agreeableness. However, these differences could reflect cohort differences or a small sample size rather than changes from aging, as this was a cross-sectional study rather than a longitudinal one. The declines are all small, around 2 points on average, and we did not run significance testing so it is impossible to tell how reliable these are as representative of real-world patterns.

While running correlation data, we found that all of the values displayed positive correlations with each other. We are not entirely sure why this happened, and wonder whether it reflects actual observations or if we have made an error when cleaning our data that resulted in the data to only show positive correlations. We posit that this may be caused by re-coding the answers to positive values, although we are unsure why that would have occurred.

Big Five Personality Project

Omar Avila, Aden Eagle, Kiana Slimak, Tristan Stinchcomb, Isabella Vallejo, and Hannah Vinik

3/9/2021

Introduction

Data Cleaning

To begin, we loaded and viewed our data set.

Here we identify reverse-keyed items and store them in a vector:

Next we converted negative scores to positive scores and replaced the old columns in the data with the updated columns.

We then summed the corresponding columns and made new total score columns.

Next, we combined the new total score columns with the original demographic data included in the dataset.

Here is our updated data set with the total score columns instead of individual values for each question item:

We then realized that the data for race was defaulting to continuous data instead of discrete data, which was causing problems with our analysis. Therefore, we decided to recode the data as discrete variables by converting the numbers assigned to each race value to character strings.

Next we removed any rows from the data which had an NA for race.

We then converted the old, continuous race data to the new, discrete race data.

Next we removed any rows from the data which had an NA for gender.

We then converted the old, continuous gender data to the new, discrete gender data.

We also removed any rows which contained an NA for age.

We then limited the age range to less than 90 to remove any false information:

We didn’t want any NA’s to show up on our final table, so we dropped any that may have been present in the data.

At this point we had finished cleaning our data, so we looked at our cleaned and updated data frame:

Data Visualization and Analysis

To visualize the data, we set up subplots and arranged them into a single figure.

We then set up a correlation matrix to examine the correlations between ALL of the different possible correlations between all of the Big Five personality traits (i.e., correlation between Extraversion and Neuroticism, correlation between Extraversion and Agreeableness, etc.).

Seeing possible statistically significant correlations, we viewed a table of the specific correlation values.

We also viewed the p-values for the correlations to assess statistical significance.

Conclusion

race	age	engnat	gender	hand	source	country	E1	E2	E3	E4	E5	E6	E7	E8	E9	E10	N1	N2	N3	N4	N5
3	53	1	1	1	1	US	4	2	5	2	5	1	4	3	5	1	1	5	2	5	1
13	46	1	2	1	1	US	2	2	3	3	3	3	1	5	1	5	2	3	4	2	3
1	14	2	2	1	1	PK	5	1	1	4	5	1	1	5	5	1	5	1	5	5	5
3	19	2	2	1	1	RO	2	5	2	4	3	4	3	4	4	5	5	4	4	2	4
11	25	2	2	1	2	US	3	1	3	3	3	1	3	1	3	5	3	3	3	4	3
13	31	1	2	1	2	US	1	5	2	4	1	3	2	4	1	5	1	5	4	5	1

N6	N7	N8	N9	N10	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	C1	C2	C3	C4	C5	C6	C7
1	1	1	1	1	1	5	1	5	2	3	1	5	4	5	4	1	5	1	5	1	4
4	3	2	2	4	1	3	3	4	4	4	2	3	4	3	4	1	3	2	3	1	5
5	5	5	5	5	5	1	5	5	1	5	1	5	5	5	4	1	5	1	5	1	5
5	5	5	4	5	2	5	4	4	3	5	3	4	4	3	3	3	4	5	1	4	5
3	3	3	3	4	5	5	3	5	1	5	1	5	5	5	3	1	5	3	3	1	1
4	4	1	5	2	2	2	3	4	3	4	3	5	5	3	2	5	4	3	3	4	5

C8	C9	C10	O1	O2	O3	O4	O5	O6	O7	O8	O9	O10
1	4	5	4	1	3	1	5	1	4	2	5	5
1	4	4	3	3	3	3	2	3	3	1	3	2
1	5	5	4	5	5	1	5	1	5	5	5	5
4	2	3	4	3	5	2	4	2	5	2	5	5
3	3	3	3	1	1	1	3	1	3	1	5	3
3	5	3	4	2	1	3	3	5	5	4	5	3

race	age	engnat	gender	hand	source	country	Extraversion	Neuroticism	Agreeableness	Conscientiousness	Openness
3	53	1	1	1	1	US	44	49	46	47	43
13	46	1	2	1	1	US	22	29	35	42	26
1	14	2	2	1	1	PK	35	14	38	49	45
3	19	2	2	1	1	RO	22	17	37	26	41
11	25	2	2	1	2	US	34	30	44	34	34
13	31	1	2	1	2	US	16	36	36	31	33

race	age	engnat	gender	hand	source	country	E1	E2	E3	E4	E5	E6	E7	E8	E9	E10	N1	N2	N3	N4	N5
3	53	1	1	1	1	US	4	2	5	2	5	1	4	3	5	1	1	5	2	5	1
13	46	1	2	1	1	US	2	2	3	3	3	3	1	5	1	5	2	3	4	2	3
1	14	2	2	1	1	PK	5	1	1	4	5	1	1	5	5	1	5	1	5	5	5
3	19	2	2	1	1	RO	2	5	2	4	3	4	3	4	4	5	5	4	4	2	4
11	25	2	2	1	2	US	3	1	3	3	3	1	3	1	3	5	3	3	3	4	3
13	31	1	2	1	2	US	1	5	2	4	1	3	2	4	1	5	1	5	4	5	1

N6	N7	N8	N9	N10	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	C1	C2	C3	C4	C5	C6	C7
1	1	1	1	1	1	5	1	5	2	3	1	5	4	5	4	1	5	1	5	1	4
4	3	2	2	4	1	3	3	4	4	4	2	3	4	3	4	1	3	2	3	1	5
5	5	5	5	5	5	1	5	5	1	5	1	5	5	5	4	1	5	1	5	1	5
5	5	5	4	5	2	5	4	4	3	5	3	4	4	3	3	3	4	5	1	4	5
3	3	3	3	4	5	5	3	5	1	5	1	5	5	5	3	1	5	3	3	1	1
4	4	1	5	2	2	2	3	4	3	4	3	5	5	3	2	5	4	3	3	4	5

C8	C9	C10	O1	O2	O3	O4	O5	O6	O7	O8	O9	O10
1	4	5	4	1	3	1	5	1	4	2	5	5
1	4	4	3	3	3	3	2	3	3	1	3	2
1	5	5	4	5	5	1	5	1	5	5	5	5
4	2	3	4	3	5	2	4	2	5	2	5	5
3	3	3	3	1	1	1	3	1	3	1	5	3
3	5	3	4	2	1	3	3	5	5	4	5	3

race	age	engnat	gender	hand	source	country	Extraversion	Neuroticism	Agreeableness	Conscientiousness	Openness
3	53	1	1	1	1	US	44	49	46	47	43
13	46	1	2	1	1	US	22	29	35	42	26
1	14	2	2	1	1	PK	35	14	38	49	45
3	19	2	2	1	1	RO	22	17	37	26	41
11	25	2	2	1	2	US	34	30	44	34	34
13	31	1	2	1	2	US	16	36	36	31	33

race	age	engnat	gender	hand	source	country	E1	E2	E3	E4	E5	E6	E7	E8	E9	E10	N1	N2	N3	N4	N5
3	53	1	1	1	1	US	4	2	5	2	5	1	4	3	5	1	1	5	2	5	1
13	46	1	2	1	1	US	2	2	3	3	3	3	1	5	1	5	2	3	4	2	3
1	14	2	2	1	1	PK	5	1	1	4	5	1	1	5	5	1	5	1	5	5	5
3	19	2	2	1	1	RO	2	5	2	4	3	4	3	4	4	5	5	4	4	2	4
11	25	2	2	1	2	US	3	1	3	3	3	1	3	1	3	5	3	3	3	4	3
13	31	1	2	1	2	US	1	5	2	4	1	3	2	4	1	5	1	5	4	5	1

N6	N7	N8	N9	N10	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	C1	C2	C3	C4	C5	C6	C7
1	1	1	1	1	1	5	1	5	2	3	1	5	4	5	4	1	5	1	5	1	4
4	3	2	2	4	1	3	3	4	4	4	2	3	4	3	4	1	3	2	3	1	5
5	5	5	5	5	5	1	5	5	1	5	1	5	5	5	4	1	5	1	5	1	5
5	5	5	4	5	2	5	4	4	3	5	3	4	4	3	3	3	4	5	1	4	5
3	3	3	3	4	5	5	3	5	1	5	1	5	5	5	3	1	5	3	3	1	1
4	4	1	5	2	2	2	3	4	3	4	3	5	5	3	2	5	4	3	3	4	5

C8	C9	C10	O1	O2	O3	O4	O5	O6	O7	O8	O9	O10
1	4	5	4	1	3	1	5	1	4	2	5	5
1	4	4	3	3	3	3	2	3	3	1	3	2
1	5	5	4	5	5	1	5	1	5	5	5	5
4	2	3	4	3	5	2	4	2	5	2	5	5
3	3	3	3	1	1	1	3	1	3	1	5	3
3	5	3	4	2	1	3	3	5	5	4	5	3

race	age	engnat	gender	hand	source	country	Extraversion	Neuroticism	Agreeableness	Conscientiousness	Openness
3	53	1	1	1	1	US	44	49	46	47	43
13	46	1	2	1	1	US	22	29	35	42	26
1	14	2	2	1	1	PK	35	14	38	49	45
3	19	2	2	1	1	RO	22	17	37	26	41
11	25	2	2	1	2	US	34	30	44	34	34
13	31	1	2	1	2	US	16	36	36	31	33