# # 2. Importing our ISSP data  

# read .csv file using the readr package from tidyverse
df <- read.csv("H:\\Kennsla\\TUOS-R\\TUOS-R 2023\\Gögn\\Environment.issp.csv") 

This is an R Markdown document containing R functions provided by participants in the ESTP online course Tidying Up Official Statistics 2024.

1. ggplot function from the ggplot2 package in the tidyverse collection

WHY?

The average age that participated in the survey of ISSP:

library(tidyverse)
n_df <- df |> 
  dplyr::filter(!is.na(SEX)) |> 
  mutate(SEX = recode(SEX,
                      "1" = "Male", 
                      "2" = "Female")) |> 
  group_by(c_alphan, SEX) |> 
  summarize(age_ave = mean(AGE, na.rm = T))
          
n_df |> 
  ggplot(aes(c_alphan, age_ave, color = SEX)) +
  geom_point() +
  xlab("Country") +
  ylab("Average age") + 
  guides(color = guide_legend("Gender")) + 
  theme_classic()


2. missForest from the missForest package

WHY?

library(missForest)

# Generate synthetic data with missing values
set.seed(123)
n <- 20  # Number of observations
p <- 5    # Number of variables
data <- matrix(rnorm(n * p), ncol = p)
# Introduce missing values
data[sample(1:(n * p), 20)] <- NA

# Convert the data to a data frame
data <- as.data.frame(data)
df_imp <- missForest(data)

df_imp$ximp  #imputed values in the imputed dataframe 
##             V1          V2          V3          V4           V5
## 1  -0.56047565 -1.06782371 -0.69470698  0.37963948  0.005764186
## 2  -0.23017749 -0.26276367  0.37568206 -0.50232345  0.385280401
## 3   1.55870831 -1.02600445 -1.26539635 -0.33320738 -0.370660032
## 4   0.07050839 -0.72889123  2.16895597 -1.01857538 -0.120765359
## 5   0.12928774 -0.62503927  1.20796200 -1.07179123 -0.220486562
## 6   1.71506499 -1.68669331 -1.12310858  0.30352864  0.331781964
## 7   0.46091621  0.83778704 -0.40288484  0.39968193  1.096839013
## 8  -0.04338035  0.15337312  0.06498841  0.05300423  0.913642385
## 9  -0.68685285 -1.13813694 -0.14455737  0.92226747  0.249540120
## 10 -0.44566197  0.53297442 -0.08336907  2.05008469  1.148807618
## 11  0.02739941  0.42646422  0.25331851 -0.49103117  0.993503856
## 12  0.35981383 -0.29507148  0.49704824 -2.30916888  0.548396960
## 13  0.40077145  0.89512566 -0.04287046  1.00573852  0.238731735
## 14  0.11068272  0.87813349  0.32629135 -0.70920076  0.632106861
## 15 -0.55584113  0.82158108 -0.14433752  1.01320583  1.360652449
## 16  0.30182500 -0.14733027  1.51647060  1.02557137 -0.600259587
## 17  0.49785048  0.55391765 -1.54875280 -0.28477301  2.187332993
## 18 -1.96661716 -0.06191171  0.58461375 -1.22071771  1.532610626
## 19  0.70135590 -0.30596266 -0.03235465  0.18130348 -0.235700359
## 20  0.56584685 -0.38047100  0.21594157 -0.13889136 -1.026420900
df_imp$OOBerror #out-of-bag error 
##     NRMSE 
## 0.9720304


3. lag function from the base R in combination with tidyverse

WHY?

#Create data frame
year = c(2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021)
id <- c("A", "B", "C", "D","A", "B", "C", "D")
value <- c(1:8)

data <- data.frame(year, id, value)

#Look up value from previous period
data <- data |> 
  arrange(id,year) |> 
  group_by(id) |> 
  mutate(value1 = lag(value, 1, order_by = year))

print(data)
## # A tibble: 8 × 4
## # Groups:   id [4]
##    year id    value value1
##   <dbl> <chr> <int>  <int>
## 1  2020 A         1     NA
## 2  2021 A         5      1
## 3  2020 B         2     NA
## 4  2021 B         6      2
## 5  2020 C         3     NA
## 6  2021 C         7      3
## 7  2020 D         4     NA
## 8  2021 D         8      4


4. The function str_pad from stringr package

WHY?

# create id numbers with 10 and 9 digits
id_numbers <- tibble(id = c("1012992255", "912993366"))

# str_pad on left side
id_numbers |> 
  mutate(id_correct = str_pad(string = id, width = 10, side= "left", pad = "0"))
## # A tibble: 2 × 2
##   id         id_correct
##   <chr>      <chr>     
## 1 1012992255 1012992255
## 2 912993366  0912993366
# another example using symbols
hirearchy <- tibble(level = c(1, 2, 1, 2),
                    value = c("A", "A1", "B", "B1"))
hirearchy |> 
  mutate(level_sym = str_pad(string = "", 
                             width = level,
                             pad = "@"))
## # A tibble: 4 × 3
##   level value level_sym
##   <dbl> <chr> <chr>    
## 1     1 A     @        
## 2     2 A1    @@       
## 3     1 B     @        
## 4     2 B1    @@


5. Pipe from base R |> and pipe from tidyverse %>%

The pipe takes the thing on its left and passes it along to the function on its right so that x |> f(y) is equivalent to f(x, y), and x |> f(y) |> g(z) is equivalent to g(f(x, y), z)

WHY?

Base R pipe |> was introduced in R 4.1.0 in 2021, and it is little bit more elegant than original pipe %>% from 2014.

For overview of differences, please see: https://www.tidyverse.org/blog/2023/04/base-vs-magrittr-pipe/


6. evGui function from the extremevalues package

WHY?

library(tidyverse)
library(extremevalues)

# Load data

mydata <- as.data.frame(cbind(1:10,
                              c("m", "m", "m", "m",
                                "m", "f", "f", "f",
                                "f", "f"),
                              c(0,0,1,4,5,2,1,1,0,3),
                    c(17,12,9,29,21,80,38,240,70,60)))

names(mydata) <- c("number", "gender", "no_of_children", "age")
mydata$age <- as.numeric(mydata$age)


# plot data
plot(mydata$age)

# Use evGui() function to explore the missing data pattern for the "age" variable

#evGui(mydata$age) 
# a pop up GUI will appear to interact with

# I - the value above which less than rho observations are expected
#II - residuals are above or below a confidence limit alpha
# applying evGui to datasets
x <- getOutliers(mydata$age, method="I", distribution="lognormal")

outlierPlot(mydata$age, x, mode="qq")

print(mydata[x$iRight,])
##   number gender no_of_children age
## 8      8      f              1 240


7. theme() from ggplot 2

WHY?

# creating theme
mytheme <- theme(panel.background = element_rect(fill = "ghostwhite"),
                 plot.background = element_rect(fill = "ghostwhite"), 
                 axis.title.x = element_text(color="lightskyblue4", size=10), 
                 axis.title.y = element_text(color = "lightskyblue4", size = 10))

# plot 
  n_df |>
    ggplot(aes(c_alphan, age_ave, color = SEX)) +
    geom_point()+
    mytheme


8. freq () and descr() functions in the summarytools package

WHY?

Example

The below example code works off the following example dataframe (mydf):

#Create data frame
ID <- c("1", "2", "3", "4","5", "6", "7", "8", "9")
SEX <- c("Male", "Female", "Male", "Female","Male", "Female","Male", "Female","Male")
wrkhrs_avg <- c(39.25,33.98,40.53, 31.22, 43.26, 32.36, 51.85, 46.91, 41.25)
Commute_km <- c(39, 28, 3, 36, 3, 21, 32, 7, 31)
Likes_job <- c(1, 1, 0, 1, 0, 0, 0, 1, 1)
JOB <- c("Teacher","Teacher","Teacher","Teacher","Teacher","Teacher","Teacher","Teacher","Teacher" )
weight <- c(169, 687, 349, 575, 117, 274, 745, 348, 532)

mydf <- data.frame(ID, SEX, wrkhrs_avg, Commute_km, Likes_job, JOB, weight)

freq/descr function in summarytools package

library(summarytools) 

View(mydf) #uppercase "V"iew for dataframe 

Calculate frequency of each job type

#### calculate frequency of each job type #### 

JOB_FREQ <- freq(mydf, JOB) 

view(JOB_FREQ) #lowercase view for summarytools object

Example

## repeat with weights 

JOB_FREQ_WT <- freq(mydf, JOB, weights = mydf$weight) 

view(JOB_FREQ_WT) 

Example

Grouped by sex

#### Grouped by sex using stby #### 

## WEIGHTED 

JOB_bysex_wt <- with(mydf, stby(JOB, SEX, freq, weights = weight)) 

view(JOB_bysex_wt) 


# convert results for females to dataframe and add new “SEX” column 

JOB_FEMALE_wt <- as.data.frame(JOB_bysex_wt[1]) %>% 

  mutate("Sex" = "Female") 

View(JOB_FEMALE_wt) 

Example

Calculate descriptive stats for commute_km

#### calculate descriptive stats for commute_km #### 

commute_descr <- descr(mydf, Commute_km) 

## repeat with weights 

commute_descr_WT <- descr(mydf, Commute_km, weights = mydf$weight) 

view(commute_descr_WT) 

Example

More examples to try out

#### Group by Likes_job #### 

## WEIGHTED 

commute_group_wt <- with(mydf, stby(Commute_km, Likes_job , descr, weights = weight)) 

commute_group_wt 
## Weighted Descriptive Statistics  
## Commute_km by Likes_job  
## Data Frame: mydf  
## Weights: weight  
## N: 4  
## 
##                         0         1
## --------------- --------- ---------
##            Mean     15.90     24.27
##         Std.Dev     13.00     11.96
##             Min      3.00      7.00
##          Median     16.12     28.22
##             Max     32.00     39.00
##             MAD     19.63     10.57
##              CV      0.82      0.49
##         N.Valid   1780.00   1897.00
##       Pct.Valid     46.89     49.97
commute_likesjob <- as.data.frame(commute_group_wt[2]) 

## can also carry out these commands for all variables in mydf at once 

freq(mydf) #ignores numeric variables 
## Frequencies  
## mydf$ID  
## Type: Character  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           1      1     11.11          11.11     11.11          11.11
##           2      1     11.11          22.22     11.11          22.22
##           3      1     11.11          33.33     11.11          33.33
##           4      1     11.11          44.44     11.11          44.44
##           5      1     11.11          55.56     11.11          55.56
##           6      1     11.11          66.67     11.11          66.67
##           7      1     11.11          77.78     11.11          77.78
##           8      1     11.11          88.89     11.11          88.89
##           9      1     11.11         100.00     11.11         100.00
##        <NA>      0                               0.00         100.00
##       Total      9    100.00         100.00    100.00         100.00
## 
## mydf$SEX  
## Type: Character  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ------------ ------ --------- -------------- --------- --------------
##       Female      4     44.44          44.44     44.44          44.44
##         Male      5     55.56         100.00     55.56         100.00
##         <NA>      0                               0.00         100.00
##        Total      9    100.00         100.00    100.00         100.00
## 
## mydf$wrkhrs_avg  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##       31.22      1     11.11          11.11     11.11          11.11
##       32.36      1     11.11          22.22     11.11          22.22
##       33.98      1     11.11          33.33     11.11          33.33
##       39.25      1     11.11          44.44     11.11          44.44
##       40.53      1     11.11          55.56     11.11          55.56
##       41.25      1     11.11          66.67     11.11          66.67
##       43.26      1     11.11          77.78     11.11          77.78
##       46.91      1     11.11          88.89     11.11          88.89
##       51.85      1     11.11         100.00     11.11         100.00
##        <NA>      0                               0.00         100.00
##       Total      9    100.00         100.00    100.00         100.00
## 
## mydf$Commute_km  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           3      2     22.22          22.22     22.22          22.22
##           7      1     11.11          33.33     11.11          33.33
##          21      1     11.11          44.44     11.11          44.44
##          28      1     11.11          55.56     11.11          55.56
##          31      1     11.11          66.67     11.11          66.67
##          32      1     11.11          77.78     11.11          77.78
##          36      1     11.11          88.89     11.11          88.89
##          39      1     11.11         100.00     11.11         100.00
##        <NA>      0                               0.00         100.00
##       Total      9    100.00         100.00    100.00         100.00
## 
## mydf$Likes_job  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0      4     44.44          44.44     44.44          44.44
##           1      5     55.56         100.00     55.56         100.00
##        <NA>      0                               0.00         100.00
##       Total      9    100.00         100.00    100.00         100.00
## 
## mydf$JOB  
## Type: Character  
## 
##                 Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ------------- ------ --------- -------------- --------- --------------
##       Teacher      9    100.00         100.00    100.00         100.00
##          <NA>      0                               0.00         100.00
##         Total      9    100.00         100.00    100.00         100.00
## 
## mydf$weight  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##         117      1     11.11          11.11     11.11          11.11
##         169      1     11.11          22.22     11.11          22.22
##         274      1     11.11          33.33     11.11          33.33
##         348      1     11.11          44.44     11.11          44.44
##         349      1     11.11          55.56     11.11          55.56
##         532      1     11.11          66.67     11.11          66.67
##         575      1     11.11          77.78     11.11          77.78
##         687      1     11.11          88.89     11.11          88.89
##         745      1     11.11         100.00     11.11         100.00
##        <NA>      0                               0.00         100.00
##       Total      9    100.00         100.00    100.00         100.00
descr(mydf, weights = mydf$weight) #ignores non-numeric variables 
## Weighted Descriptive Statistics  
## mydf  
## Weights: weight  
## N: 9  
## 
##                   wrkhrs_avg   Commute_km   Likes_job
## --------------- ------------ ------------ -----------
##            Mean        40.28        25.41        0.61
##         Std.Dev         7.41        11.55        0.49
##             Min        31.22         3.00        0.00
##          Median        40.56        30.30        1.00
##             Max        51.85        39.00        1.00
##             MAD         9.95         5.63        0.00
##              CV         0.18         0.45        0.80
##         N.Valid      3796.00      3796.00     3796.00
##       Pct.Valid       100.00       100.00      100.00
#stby(mydf, mydf$Likes_job , freq, weights = mydf$weight)##doesn't work  

##can only look at one variable at a time when using stby() with freq() 

##descr() + stby() does work for multiple variables at once 

stby(mydf, mydf$Likes_job , descr, weights = mydf$weight) 
## Weighted Descriptive Statistics  
## mydf  
## Weights: weight  
## Group: Likes_job = 0  
## N: 4  
## 
##                   wrkhrs_avg   Commute_km   Likes_job
## --------------- ------------ ------------ -----------
##            Mean        43.64        15.90        0.00
##         Std.Dev         6.91        13.00        0.00
##             Min        32.36         3.00        0.00
##          Median        43.65        16.12        0.00
##             Max        51.85        32.00        0.00
##             MAD         7.02        19.63        0.00
##              CV         0.16         0.82         NaN
##         N.Valid      1780.00      1780.00     1780.00
##       Pct.Valid        46.89        46.89       46.89
## 
## Group: Likes_job = 1  
## N: 5  
## 
##                   wrkhrs_avg   Commute_km   Likes_job
## --------------- ------------ ------------ -----------
##            Mean        38.31        24.27        1.00
##         Std.Dev         6.25        11.96        0.00
##             Min        31.22         7.00        1.00
##          Median        37.13        28.22        1.00
##             Max        46.91        39.00        1.00
##             MAD         6.49        10.57        0.00
##              CV         0.16         0.49        0.00
##         N.Valid      1897.00      1897.00     1897.00
##       Pct.Valid        49.97        49.97       49.97


9. Pivot_longer()

WHY?


Example of using pivot_longer() to deal with data on lending standards

Source Recommendation of the European Systemic Risk Board of 31 October 2016 on closing real estate data gaps (ESRB/2016/14) (europa.eu)

```