Dataset: Big5

Visit the website for more details about the dataset: https://openpsychometrics.org/

Openness - People who like to learn new things and enjoy new experiences usually score high in openness. Openness includes traits like being insightful and imaginative and having a wide variety of interests.

Conscientiousness - People that have a high degree of conscientiousness are reliable and prompt. Traits include being organized, methodic, and thorough.

Extraversion - Extraverts get their energy from interacting with others, while introverts get their energy from within themselves. Extraversion includes traits such as being energetic, talkative, and assertive.

Agreeableness - These individuals are friendly, cooperative, and compassionate. People with low agreeableness may be more distant. Traits include being kind, affectionate, and sympathetic.

Neuroticism - Neuroticism is sometimes described in terms of its opposite pole, Emotional Stability. This dimension relates to one’s emotional stability and degree of negative emotions. People that score high on neuroticism often experience emotional instability and negative emotions. Traits include being moody and tense.

Load the dataset

library(tidyverse)
# Read the Big5 responses into a tibble. read_tsv() parses the file as
# tab-separated even though the file name ends in ".csv".
df <- read_tsv(file = "big5.csv")

Subset the dataset

# Remove every row that has an NA in any column of the raw data, then keep
# age, country of residence, and all item columns from E1 through O10
# (the E, N, A, C and O items, not only extraversion).
# NOTE(review): na.omit() runs before select(), so an NA in a column that is
# discarded afterwards still removes the row -- confirm this is intended.
df <- df %>%
  na.omit() %>%
  select(age, country, E1:O10)
head(df)
## # A tibble: 6 x 52
##     age country    E1    E2    E3    E4    E5    E6    E7    E8    E9   E10
##   <int> <chr>   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1    53 US          4     2     5     2     5     1     4     3     5     1
## 2    46 US          2     2     3     3     3     3     1     5     1     5
## 3    14 PK          5     1     1     4     5     1     1     5     5     1
## 4    19 RO          2     5     2     4     3     4     3     4     4     5
## 5    25 US          3     1     3     3     3     1     3     1     3     5
## 6    31 US          1     5     2     4     1     3     2     4     1     5
## # ... with 40 more variables: N1 <int>, N2 <int>, N3 <int>, N4 <int>,
## #   N5 <int>, N6 <int>, N7 <int>, N8 <int>, N9 <int>, N10 <int>, A1 <int>,
## #   A2 <int>, A3 <int>, A4 <int>, A5 <int>, A6 <int>, A7 <int>, A8 <int>,
## #   A9 <int>, A10 <int>, C1 <int>, C2 <int>, C3 <int>, C4 <int>, C5 <int>,
## #   C6 <int>, C7 <int>, C8 <int>, C9 <int>, C10 <int>, O1 <int>, O2 <int>,
## #   O3 <int>, O4 <int>, O5 <int>, O6 <int>, O7 <int>, O8 <int>, O9 <int>,
## #   O10 <int>

Data transformation

# Reshape each trait's ten item columns from wide to long form.
# gather() stacks the selected columns one after another and keeps the
# original row order inside each column, so row i of every long table below
# refers to the same respondent and the same item number (E1/N1/A1/C1/O1
# first, then item 2, and so on). bind_cols() at the end relies on that
# shared ordering because it pastes the tables together purely by position.

# gather extraversion; age and country stay as identifier columns.
# Columns are selected by name rather than by position: positions 1:17 would
# also pull in N1..N5, which gather() would then repeat on every output row.
df_e <- df %>%
  select(age, country, E1:E10) %>%
  gather(E1:E10, key = "Extraversion", value = "E_Value")

# gather neuroticism
df_n <- df %>%
  select(N1:N10) %>%
  gather(N1:N10, key = "Neuroticism", value = "N_Value")

# gather agreeableness
df_a <- df %>%
  select(A1:A10) %>%
  gather(A1:A10, key = "Agreeableness", value = "A_Value")

# gather conscientiousness
df_c <- df %>%
  select(C1:C10) %>%
  gather(C1:C10, key = "Conscientiousness", value = "C_Value")

# gather openness
df_o <- df %>%
  select(O1:O10) %>%
  gather(O1:O10, key = "Openness", value = "O_Value")

# combine dataset: one row per respondent and item number, with a
# name/value column pair for each of the five traits plus age and country.
l_df <- bind_cols(df_a, df_c, df_e, df_n, df_o)

head(l_df)
## # A tibble: 6 x 17
##   Agreeableness A_Value Conscientiousne~ C_Value   age country    N1    N2
##   <chr>           <int> <chr>              <int> <int> <chr>   <int> <int>
## 1 A1                  1 C1                     4    53 US          1     5
## 2 A1                  1 C1                     4    46 US          2     3
## 3 A1                  5 C1                     4    14 PK          5     1
## 4 A1                  2 C1                     3    19 RO          5     4
## 5 A1                  5 C1                     3    25 US          3     3
## 6 A1                  2 C1                     2    31 US          1     5
## # ... with 9 more variables: N3 <int>, N4 <int>, N5 <int>,
## #   Extraversion <chr>, E_Value <int>, Neuroticism <chr>, N_Value <int>,
## #   Openness <chr>, O_Value <int>
# Show the last six rows of the combined long-form table.
l_df %>% tail(n = 6)
## # A tibble: 6 x 17
##   Agreeableness A_Value Conscientiousne~ C_Value   age country    N1    N2
##   <chr>           <int> <chr>              <int> <int> <chr>   <int> <int>
## 1 A10                 3 C10                    4    26 US          2     5
## 2 A10                 5 C10                    4    15 SG          5     3
## 3 A10                 3 C10                    3    37 US          2     4
## 4 A10                 3 C10                    5    16 US          5     1
## 5 A10                 3 C10                    5    16 NG          4     3
## 6 A10                 4 C10                    5    35 US          5     2
## # ... with 9 more variables: N3 <int>, N4 <int>, N5 <int>,
## #   Extraversion <chr>, E_Value <int>, Neuroticism <chr>, N_Value <int>,
## #   Openness <chr>, O_Value <int>
Then we can do further analysis of the impact of extraversion based on the long-form dataset.
# Keep only the columns needed to analyse extraversion. A select() call with
# no arguments keeps no columns at all and returns a 0-column tibble.
l_df %>%
  select(age, country, Extraversion, E_Value)
## # A tibble: 197,100 x 0
# One example: plot the 30 countries with the largest mean agreeableness value.
# summarise() returns one row per country in group-key order, so the rows are
# sorted by the mean (largest first) before head(30); without the sort the
# plot would show the first 30 country codes rather than the top 30.
# The mean is taken over every A_Value row of a country, i.e. over all
# respondents and all ten agreeableness items.
l_df %>%
  group_by(country) %>%
  summarise(avg = mean(A_Value)) %>%
  arrange(desc(avg)) %>%
  head(30) %>%
  ggplot(aes(x = reorder(country, avg), y = avg)) +
  geom_col(aes(fill = country)) +
  labs(title = "Country of the agreeableness") +
  xlab("Country") +
  ylab("Avg") +
  coord_flip() +
  theme(legend.position = "none")


Dataset: Avocado Prices

http://www.hassavocadoboard.com/retail/volume-and-price-data

Load the dataset

# Read the avocado price/volume data from a comma-separated file.
avocado <- read_csv(file = "avocado.csv")

Data transformation

# Collapse the columns at positions 4 to 6 into two columns: ProductCode holds
# the former column name and Volume holds its value.
# NOTE(review): the columns are picked by position, so this depends on the
# column order of avocado.csv -- confirm 4:6 are the per-product volumes.
avocado <- gather(avocado, key = "ProductCode", value = "Volume", 4:6)

head(avocado)
## # A tibble: 6 x 9
##   Date       AveragePrice `Total Volume` `Total Bags` `Small Bags`
##   <date>            <dbl>          <dbl>        <dbl>        <dbl>
## 1 2018-12-02         1.22      27652426.    10151173.     7174281.
## 2 2018-11-25         1.25      21855494.     8290842.     6016143.
## 3 2018-11-18         1.14      29871135.    12382973.     9174635.
## 4 2018-11-11         1         39042283.    14698246.    10721367.
## 5 2018-11-04         1.01      38363283.    15222197.    10602849.
## 6 2018-10-28         1.01      39402227.    15865346.    11437563.
## # ... with 4 more variables: `Large Bags` <dbl>, `XLarge Bags` <dbl>,
## #   ProductCode <chr>, Volume <dbl>
# Show the last six rows of the long-form avocado table.
avocado %>% tail(n = 6)
## # A tibble: 6 x 9
##   Date       AveragePrice `Total Volume` `Total Bags` `Small Bags`
##   <date>            <dbl>          <dbl>        <dbl>        <dbl>
## 1 2018-02-11         0.97      43167806.    15102427.    10844852.
## 2 2018-02-04         0.87      62505647.    19373134.    13384587.
## 3 2018-01-28         1.09      40171641.    12923982.     9749412.
## 4 2018-01-21         1.08      42939822.    13862460.     9866218.
## 5 2018-01-14         1.2       37299945.    12180021.     8128242.
## 6 2018-01-07         1.13      36703157.    11513110.     8231766.
## # ... with 4 more variables: `Large Bags` <dbl>, `XLarge Bags` <dbl>,
## #   ProductCode <chr>, Volume <dbl>


Dataset: UNICEF dataset

The UNICEF dataset on under-five child mortality across 196 countries.
Visit the website for more details about the dataset: http://www.childmortality.org/

Load the dataset

# Read the UNICEF under-five mortality table from a comma-separated file.
u5mr <- read_csv(file = "unicef-u5mr.csv")

Data transformation

# Stack the yearly columns "U5MR 1950" through "U5MR 2015" into year/u5mr
# pairs, then strip the "U5MR " prefix from each former column name and store
# the remaining year as a number.
l_u5mr <- gather(u5mr, key = "year", value = "u5mr", "U5MR 1950":"U5MR 2015")
l_u5mr <- mutate(l_u5mr, year = as.numeric(gsub("U5MR ", "", year)))

head(l_u5mr)
## # A tibble: 6 x 3
##   CountryName        year  u5mr
##   <chr>             <dbl> <dbl>
## 1 Afghanistan        1950    NA
## 2 Albania            1950    NA
## 3 Algeria            1950    NA
## 4 Andorra            1950    NA
## 5 Angola             1950    NA
## 6 Antigua & Barbuda  1950    NA
# Show the last six rows of the long-form mortality table.
l_u5mr %>% tail(n = 6)
## # A tibble: 6 x 3
##   CountryName  year  u5mr
##   <chr>       <dbl> <dbl>
## 1 Uruguay      2015  10.1
## 2 Uzbekistan   2015  39.1
## 3 Venezuela    2015  14.9
## 4 Samoa        2015  17.5
## 5 Yemen        2015  41.9
## 6 Zambia       2015  64