# 2. Importing our ISSP data

This is an R Markdown document containing R functions provided by participants in the ESTP online course Tidying Up Official Statistics 2024.

# read the .csv file (base R read.csv(); readr::read_csv() from the tidyverse is an alternative)
df <- read.csv("H:\\Kennsla\\TUOS-R\\TUOS-R 2023\\Gögn\\Environment.issp.csv")
WHY?
The average age of respondents in the ISSP survey, by country and gender:
library(tidyverse)

n_df <- df |>
  dplyr::filter(!is.na(SEX)) |>
  mutate(SEX = recode(SEX,
                      "1" = "Male",
                      "2" = "Female")) |>
  group_by(c_alphan, SEX) |>
  summarize(age_ave = mean(AGE, na.rm = TRUE))
n_df |>
  ggplot(aes(c_alphan, age_ave, color = SEX)) +
  geom_point() +
  xlab("Country") +
  ylab("Average age") +
  guides(color = guide_legend("Gender")) +
  theme_classic()
WHY?
library(missForest)

# Generate synthetic data with missing values
set.seed(123)
n <- 20  # number of observations
p <- 5   # number of variables
data <- matrix(rnorm(n * p), ncol = p)

# Introduce missing values at 20 random positions
data[sample(1:(n * p), 20)] <- NA

# Convert the matrix to a data frame
data <- as.data.frame(data)

# Impute with random forests
df_imp <- missForest(data)
df_imp$ximp  # the completed (imputed) data frame
## V1 V2 V3 V4 V5
## 1 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186
## 2 -0.23017749 -0.26276367 0.37568206 -0.50232345 0.385280401
## 3 1.55870831 -1.02600445 -1.26539635 -0.33320738 -0.370660032
## 4 0.07050839 -0.72889123 2.16895597 -1.01857538 -0.120765359
## 5 0.12928774 -0.62503927 1.20796200 -1.07179123 -0.220486562
## 6 1.71506499 -1.68669331 -1.12310858 0.30352864 0.331781964
## 7 0.46091621 0.83778704 -0.40288484 0.39968193 1.096839013
## 8 -0.04338035 0.15337312 0.06498841 0.05300423 0.913642385
## 9 -0.68685285 -1.13813694 -0.14455737 0.92226747 0.249540120
## 10 -0.44566197 0.53297442 -0.08336907 2.05008469 1.148807618
## 11 0.02739941 0.42646422 0.25331851 -0.49103117 0.993503856
## 12 0.35981383 -0.29507148 0.49704824 -2.30916888 0.548396960
## 13 0.40077145 0.89512566 -0.04287046 1.00573852 0.238731735
## 14 0.11068272 0.87813349 0.32629135 -0.70920076 0.632106861
## 15 -0.55584113 0.82158108 -0.14433752 1.01320583 1.360652449
## 16 0.30182500 -0.14733027 1.51647060 1.02557137 -0.600259587
## 17 0.49785048 0.55391765 -1.54875280 -0.28477301 2.187332993
## 18 -1.96661716 -0.06191171 0.58461375 -1.22071771 1.532610626
## 19 0.70135590 -0.30596266 -0.03235465 0.18130348 -0.235700359
## 20 0.56584685 -0.38047100 0.21594157 -0.13889136 -1.026420900
df_imp$OOBerror  # out-of-bag imputation error estimate (NRMSE)
## NRMSE
## 0.9720304
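Since the data here are synthetic, the OOB estimate can be checked against the true imputation error with missForest's mixError(). A minimal sketch, assuming the complete matrix is kept aside before the NAs are introduced (m_true and m_mis are hypothetical names):

```r
library(missForest)

set.seed(123)
n <- 20; p <- 5
m_true <- matrix(rnorm(n * p), ncol = p)  # complete data, kept for comparison

m_mis <- m_true
m_mis[sample(length(m_mis), 20)] <- NA    # knock out 20 values

imp <- missForest(as.data.frame(m_mis))

imp$OOBerror                                                     # estimated NRMSE
mixError(imp$ximp, as.data.frame(m_mis), as.data.frame(m_true))  # true NRMSE
```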
WHY?
# Create data frame
year <- c(2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021)
id <- c("A", "B", "C", "D", "A", "B", "C", "D")
value <- c(1:8)
data <- data.frame(year, id, value)

# Look up the value from the previous period with lag()
data <- data |>
  arrange(id, year) |>
  group_by(id) |>
  mutate(value1 = lag(value, 1, order_by = year))
print(data)
## # A tibble: 8 × 4
## # Groups: id [4]
## year id value value1
## <dbl> <chr> <int> <int>
## 1 2020 A 1 NA
## 2 2021 A 5 1
## 3 2020 B 2 NA
## 4 2021 B 6 2
## 5 2020 C 3 NA
## 6 2021 C 7 3
## 7 2020 D 4 NA
## 8 2021 D 8 4
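dplyr's lead() works the same way in the other direction, looking up the value from the next period. A minimal sketch on the same data frame (the value_next column name is just for illustration):

```r
# look up the value from the next period with lead()
data |>
  arrange(id, year) |>
  group_by(id) |>
  mutate(value_next = lead(value, 1, order_by = year))
```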
WHY?
# create id numbers with 10 and 9 digits
id_numbers <- tibble(id = c("1012992255", "912993366"))

# str_pad on the left side, padding with "0"
id_numbers |>
  mutate(id_correct = str_pad(string = id, width = 10, side = "left", pad = "0"))
## # A tibble: 2 × 2
## id id_correct
## <chr> <chr>
## 1 1012992255 1012992255
## 2 912993366 0912993366
# another example using symbols
hierarchy <- tibble(level = c(1, 2, 1, 2),
                    value = c("A", "A1", "B", "B1"))
hierarchy |>
  mutate(level_sym = str_pad(string = "",
                             width = level,
                             pad = "@"))
## # A tibble: 4 × 3
## level value level_sym
## <dbl> <chr> <chr>
## 1 1 A @
## 2 2 A1 @@
## 3 1 B @
## 4 2 B1 @@
The pipe takes the thing on its left and passes it along to the function on its right, so that x |> f(y) is equivalent to f(x, y), and x |> f(y) |> g(z) is equivalent to g(f(x, y), z).
WHY?
The base R pipe |> was introduced in R 4.1.0 in 2021; it is a little more elegant than the original magrittr pipe %>% from 2014.
For an overview of the differences, see: https://www.tidyverse.org/blog/2023/04/base-vs-magrittr-pipe/
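A minimal sketch of a few of the practical differences (the "_" placeholder assumes R 4.2.0 or newer):

```r
library(magrittr)

x <- c(1, 4, 9)

# magrittr accepts a bare function name; the base pipe requires a call
x %>% sqrt     # works
x |> sqrt()    # works ("x |> sqrt" would be a syntax error)

# the placeholders differ: "." for magrittr, "_" for base R,
# and the base placeholder must be passed as a named argument
c(2, NA, 5) %>% mean(x = ., na.rm = TRUE)
c(2, NA, 5) |> mean(x = _, na.rm = TRUE)
```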
WHY?
library(tidyverse)
library(extremevalues)

# Load data
mydata <- as.data.frame(cbind(1:10,
                              c("m", "m", "m", "m",
                                "m", "f", "f", "f",
                                "f", "f"),
                              c(0, 0, 1, 4, 5, 2, 1, 1, 0, 3),
                              c(17, 12, 9, 29, 21, 80, 38, 240, 70, 60)))
names(mydata) <- c("number", "gender", "no_of_children", "age")
mydata$age <- as.numeric(mydata$age)

# plot data
plot(mydata$age)

# Use evGui() to interactively explore outlier detection for the "age" variable;
# a pop-up GUI will appear
#evGui(mydata$age)
# method I  - the value above which fewer than rho observations are expected
# method II - residuals are above or below a confidence limit alpha

# apply getOutliers() to the data
x <- getOutliers(mydata$age, method = "I", distribution = "lognormal")
outlierPlot(mydata$age, x, mode = "qq")
print(mydata[x$iRight, ])
## number gender no_of_children age
## 8 8 f 1 240
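Method "II" (the confidence-limit approach described in the comments above) can be applied the same way. A minimal sketch, assuming the package defaults for the alpha limits and the "residual" plot mode:

```r
# method "II": flag observations whose regression residuals exceed
# the confidence limit alpha (package defaults assumed here)
y <- getOutliers(mydata$age, method = "II", distribution = "lognormal")
outlierPlot(mydata$age, y, mode = "residual")
print(mydata[y$iRight, ])
```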
WHY?
# create a reusable theme
mytheme <- theme(panel.background = element_rect(fill = "ghostwhite"),
                 plot.background = element_rect(fill = "ghostwhite"),
                 axis.title.x = element_text(color = "lightskyblue4", size = 10),
                 axis.title.y = element_text(color = "lightskyblue4", size = 10))

# plot
n_df |>
  ggplot(aes(c_alphan, age_ave, color = SEX)) +
  geom_point() +
  mytheme
WHY?
The example code below works off the following example data frame (mydf):
# Create data frame
ID <- c("1", "2", "3", "4", "5", "6", "7", "8", "9")
SEX <- c("Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male")
wrkhrs_avg <- c(39.25, 33.98, 40.53, 31.22, 43.26, 32.36, 51.85, 46.91, 41.25)
Commute_km <- c(39, 28, 3, 36, 3, 21, 32, 7, 31)
Likes_job <- c(1, 1, 0, 1, 0, 0, 0, 1, 1)
JOB <- rep("Teacher", 9)
weight <- c(169, 687, 349, 575, 117, 274, 745, 348, 532)
mydf <- data.frame(ID, SEX, wrkhrs_avg, Commute_km, Likes_job, JOB, weight)
The freq()/descr() functions in the summarytools package:

- almost identical code but two different outputs (frequencies vs descriptive statistics)
- can be used with weighted datasets
library(summarytools)
View(mydf)  # uppercase View() for a data frame
Calculate frequency of each job type
#### calculate frequency of each job type ####
JOB_FREQ <- freq(mydf, JOB)
view(JOB_FREQ)  # lowercase view() for a summarytools object
## repeat with weights
JOB_FREQ_WT <- freq(mydf, JOB, weights = mydf$weight)
view(JOB_FREQ_WT)
Grouped by sex
#### Grouped by sex using stby ####
## WEIGHTED
JOB_bysex_wt <- with(mydf, stby(JOB, SEX, freq, weights = weight))
view(JOB_bysex_wt)
# convert results for females to a data frame and add a new "Sex" column
JOB_FEMALE_wt <- as.data.frame(JOB_bysex_wt[1]) %>%
  mutate(Sex = "Female")
View(JOB_FEMALE_wt)
Calculate descriptive stats for commute_km
#### calculate descriptive stats for commute_km ####
commute_descr <- descr(mydf, Commute_km)
## repeat with weights
commute_descr_WT <- descr(mydf, Commute_km, weights = mydf$weight)
view(commute_descr_WT)
More examples to try out
#### Group by Likes_job ####
## WEIGHTED
commute_group_wt <- with(mydf, stby(Commute_km, Likes_job, descr, weights = weight))
commute_group_wt
## Weighted Descriptive Statistics
## Commute_km by Likes_job
## Data Frame: mydf
## Weights: weight
## N: 4
##
## 0 1
## --------------- --------- ---------
## Mean 15.90 24.27
## Std.Dev 13.00 11.96
## Min 3.00 7.00
## Median 16.12 28.22
## Max 32.00 39.00
## MAD 19.63 10.57
## CV 0.82 0.49
## N.Valid 1780.00 1897.00
## Pct.Valid 46.89 49.97
commute_likesjob <- as.data.frame(commute_group_wt[2])  # group Likes_job = 1
## can also carry out these commands for all variables in mydf at once
freq(mydf)  # frequencies for every variable in mydf at once
## Frequencies
## mydf$ID
## Type: Character
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 1 1 11.11 11.11 11.11 11.11
## 2 1 11.11 22.22 11.11 22.22
## 3 1 11.11 33.33 11.11 33.33
## 4 1 11.11 44.44 11.11 44.44
## 5 1 11.11 55.56 11.11 55.56
## 6 1 11.11 66.67 11.11 66.67
## 7 1 11.11 77.78 11.11 77.78
## 8 1 11.11 88.89 11.11 88.89
## 9 1 11.11 100.00 11.11 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$SEX
## Type: Character
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ------------ ------ --------- -------------- --------- --------------
## Female 4 44.44 44.44 44.44 44.44
## Male 5 55.56 100.00 55.56 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$wrkhrs_avg
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 31.22 1 11.11 11.11 11.11 11.11
## 32.36 1 11.11 22.22 11.11 22.22
## 33.98 1 11.11 33.33 11.11 33.33
## 39.25 1 11.11 44.44 11.11 44.44
## 40.53 1 11.11 55.56 11.11 55.56
## 41.25 1 11.11 66.67 11.11 66.67
## 43.26 1 11.11 77.78 11.11 77.78
## 46.91 1 11.11 88.89 11.11 88.89
## 51.85 1 11.11 100.00 11.11 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$Commute_km
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 3 2 22.22 22.22 22.22 22.22
## 7 1 11.11 33.33 11.11 33.33
## 21 1 11.11 44.44 11.11 44.44
## 28 1 11.11 55.56 11.11 55.56
## 31 1 11.11 66.67 11.11 66.67
## 32 1 11.11 77.78 11.11 77.78
## 36 1 11.11 88.89 11.11 88.89
## 39 1 11.11 100.00 11.11 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$Likes_job
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 4 44.44 44.44 44.44 44.44
## 1 5 55.56 100.00 55.56 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$JOB
## Type: Character
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ------------- ------ --------- -------------- --------- --------------
## Teacher 9 100.00 100.00 100.00 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
##
## mydf$weight
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 117 1 11.11 11.11 11.11 11.11
## 169 1 11.11 22.22 11.11 22.22
## 274 1 11.11 33.33 11.11 33.33
## 348 1 11.11 44.44 11.11 44.44
## 349 1 11.11 55.56 11.11 55.56
## 532 1 11.11 66.67 11.11 66.67
## 575 1 11.11 77.78 11.11 77.78
## 687 1 11.11 88.89 11.11 88.89
## 745 1 11.11 100.00 11.11 100.00
## <NA> 0 0.00 100.00
## Total 9 100.00 100.00 100.00 100.00
descr(mydf, weights = mydf$weight)  # ignores non-numeric variables
## Weighted Descriptive Statistics
## mydf
## Weights: weight
## N: 9
##
## wrkhrs_avg Commute_km Likes_job
## --------------- ------------ ------------ -----------
## Mean 40.28 25.41 0.61
## Std.Dev 7.41 11.55 0.49
## Min 31.22 3.00 0.00
## Median 40.56 30.30 1.00
## Max 51.85 39.00 1.00
## MAD 9.95 5.63 0.00
## CV 0.18 0.45 0.80
## N.Valid 3796.00 3796.00 3796.00
## Pct.Valid 100.00 100.00 100.00
#stby(mydf, mydf$Likes_job, freq, weights = mydf$weight)  ## doesn't work
## stby() with freq() can only look at one variable at a time
## descr() + stby() does work for multiple variables at once
stby(mydf, mydf$Likes_job, descr, weights = mydf$weight)
## Weighted Descriptive Statistics
## mydf
## Weights: weight
## Group: Likes_job = 0
## N: 4
##
## wrkhrs_avg Commute_km Likes_job
## --------------- ------------ ------------ -----------
## Mean 43.64 15.90 0.00
## Std.Dev 6.91 13.00 0.00
## Min 32.36 3.00 0.00
## Median 43.65 16.12 0.00
## Max 51.85 32.00 0.00
## MAD 7.02 19.63 0.00
## CV 0.16 0.82 NaN
## N.Valid 1780.00 1780.00 1780.00
## Pct.Valid 46.89 46.89 46.89
##
## Group: Likes_job = 1
## N: 5
##
## wrkhrs_avg Commute_km Likes_job
## --------------- ------------ ------------ -----------
## Mean 38.31 24.27 1.00
## Std.Dev 6.25 11.96 0.00
## Min 31.22 7.00 1.00
## Median 37.13 28.22 1.00
## Max 46.91 39.00 1.00
## MAD 6.49 10.57 0.00
## CV 0.16 0.49 0.00
## N.Valid 1897.00 1897.00 1897.00
## Pct.Valid 49.97 49.97 49.97
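As the comments above note, freq() inside stby() takes one variable at a time. A minimal sketch of the per-variable workaround, reusing the grouped pattern from the sex example:

```r
# frequencies of SEX within each Likes_job group (one variable per call)
with(mydf, stby(SEX, Likes_job, freq, weights = weight))
```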
WHY?
Example of using pivot_longer() to deal with data on lending standards
```