class: center, middle, inverse, title-slide .title[ # Advanced quantitative data analysis ] .subtitle[ ## R basic II ] .author[ ### Mengni Chen ] .institute[ ### Department of Sociology, University of Copenhagen ] --- #Packages today - packages used in this session - tidyverse - haven - janitor ```r #we installed "tidyverse" last week #now we need to install "haven" and "janitor" #install.packages("tidyverse") #for those who did not install tidyverse, please install now. install.packages("haven") install.packages("janitor") ``` ```r #let us load the three packages library(tidyverse) #check packages embedded in the tidyverse: https://www.tidyverse.org/packages/ library(haven) # Read and handle SPSS, Stata & SAS data (no need to install) library(janitor) #to provide some basic tabulations ``` --- #Some vectors ```r age <- c(34, 22, 42, 12, 76) conti <- factor(x = c("Europe", "Africa", "Africa", "Asia", "S. America"), levels = c("Africa", "Asia", "Australia", "Europe", "N. America", "S. America")) employed <- c(FALSE, TRUE, TRUE, TRUE, TRUE) name <- c("Agnes", "Martin", "Hakan", "Tu", "Thais") nr_kids <- c(1, 0, 3, 0, 4) ``` --- #Data frames Data frames organize vectors of **equal length** along their indices. .pull-left[ ```r # Bind our 5 vectors along their index into a data frame. # Assign that data frame to object "Dat". (Dat <- data.frame(name, age, conti, employed, nr_kids)) ``` ``` ## name age conti employed nr_kids ## 1 Agnes 34 Europe FALSE 1 ## 2 Martin 22 Africa TRUE 0 ## 3 Hakan 42 Africa TRUE 3 ## 4 Tu 12 Asia TRUE 0 ## 5 Thais 76 S. 
America TRUE 4 ``` ] .pull-right[ ```r age <- c(34, 22, 42, 12, NA) name <- c("Agnes", "Martin", "Hakan", "Tu", "Thais") (Dat_wNA <- data.frame(name, age)) ``` ``` ## name age ## 1 Agnes 34 ## 2 Martin 22 ## 3 Hakan 42 ## 4 Tu 12 ## 5 Thais NA ``` ] --- #Data frames .center[Data frames are the typical "rectangular" way to organize data:] <img src="https://d33wubrfki0l68.cloudfront.net/6f1ddb544fc5c69a2478e444ab8112fb0eea23f8/91adc/images/tidy-1.png" width="80%" style="display: block; margin: auto;" > --- #Tibbles Tibbles are data frames. But they have some improved features, so we will work with them. .pull-left[ ```r Dat ``` ``` ## name age conti employed nr_kids ## 1 Agnes 34 Europe FALSE 1 ## 2 Martin 22 Africa TRUE 0 ## 3 Hakan 42 Africa TRUE 3 ## 4 Tu 12 Asia TRUE 0 ## 5 Thais 76 S. America TRUE 4 ``` ] .pull-right[ ```r # Make Dat a tibble and assign it to object "Dat", # (effectively overwriting Dat as a tibble). (Dat <- as_tibble(Dat)) ``` ``` ## # A tibble: 5 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Agnes 34 Europe FALSE 1 ## 2 Martin 22 Africa TRUE 0 ## 3 Hakan 42 Africa TRUE 3 ## 4 Tu 12 Asia TRUE 0 ## 5 Thais 76 S. America TRUE 4 ``` ] --- #Address single variable using **$** -- ```r # Return variable "conti" contained in tibble Dat. # (R for Data Science mentions two further commands.) Dat$conti #return the variable ``` ``` ## [1] Europe Africa Africa Asia S. America ## Levels: Africa Asia Australia Europe N. America S. America ``` -- ```r # Give a summary of numeric vector age contained in Dat. summary(Dat$age) ``` ``` ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 12.0 22.0 34.0 37.2 42.0 76.0 ``` -- ```r # Give a summary of factor vector conti contained in Dat. summary(Dat$conti) ``` ``` ## Africa Asia Australia Europe N. America S. 
America ## 2 1 0 1 0 1 ``` --- #`select()` several variables For more features of `select()`, see `?select` -- .pull-left[ ```r # Select from tibble Dat the variables name and nr_kids, # and assign them to the new object Dat_small1a. (Dat_small1a <- select(Dat, name, nr_kids)) ``` ``` ## # A tibble: 5 × 2 ## name nr_kids ## <chr> <dbl> ## 1 Agnes 1 ## 2 Martin 0 ## 3 Hakan 3 ## 4 Tu 0 ## 5 Thais 4 ``` ] -- .pull-right[ ```r # Select from object Dat all variables that start with n, # and assign the result to a new object Dat_smallb. (Dat_smallb <- select(Dat, starts_with("n"))) ``` ``` ## # A tibble: 5 × 2 ## name nr_kids ## <chr> <dbl> ## 1 Agnes 1 ## 2 Martin 0 ## 3 Hakan 3 ## 4 Tu 0 ## 5 Thais 4 ``` ] --- #use select() to drop variables .pull-left[ drop variable age ```r (Dat_small2 <- select(Dat, -age))# remove variable "age", similar to drop in Stata ``` ``` ## # A tibble: 5 × 4 ## name conti employed nr_kids ## <chr> <fct> <lgl> <dbl> ## 1 Agnes Europe FALSE 1 ## 2 Martin Africa TRUE 0 ## 3 Hakan Africa TRUE 3 ## 4 Tu Asia TRUE 0 ## 5 Thais S. America TRUE 4 ``` ```r # if you have more variables to drop (Dat_small2 <- select(Dat, -c(name,age))) ``` ] .pull-right[ drop variables "name" and "nr_kids" (number of children) ```r (Dat_small3 <- select(Dat, -starts_with("n"))) ``` ``` ## # A tibble: 5 × 3 ## age conti employed ## <dbl> <fct> <lgl> ## 1 34 Europe FALSE ## 2 22 Africa TRUE ## 3 42 Africa TRUE ## 4 12 Asia TRUE ## 5 76 S. America TRUE ``` ] --- # `filter()` cases based on values in certain variables ```r # Either, use dplyr's (part of tidyverse) filter() function, # to return all cases contained in Dat with value "Africa" in conti. 
(Dat_small1<- dplyr::filter(Dat, conti == "Africa")) ``` ``` ## # A tibble: 2 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Martin 22 Africa TRUE 0 ## 2 Hakan 42 Africa TRUE 3 ``` ```r #or (Dat_small1<- filter(Dat, conti == "Africa")) ``` --- #select cases using `[]` ```r Dat ``` ``` ## # A tibble: 5 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Agnes 34 Europe FALSE 1 ## 2 Martin 22 Africa TRUE 0 ## 3 Hakan 42 Africa TRUE 3 ## 4 Tu 12 Asia TRUE 0 ## 5 Thais 76 S. America TRUE 4 ``` ```r (Dat_small2 <- Dat[Dat$conti == "Africa", ]) ``` ``` ## # A tibble: 2 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Martin 22 Africa TRUE 0 ## 2 Hakan 42 Africa TRUE 3 ``` --- # select cases using `[]` Compare the three code examples below to understand `[row, column]`. -- .pull-left[ ```r Dat ``` ``` ## # A tibble: 5 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Agnes 34 Europe FALSE 1 ## 2 Martin 22 Africa TRUE 0 ## 3 Hakan 42 Africa TRUE 3 ## 4 Tu 12 Asia TRUE 0 ## 5 Thais 76 S. America TRUE 4 ``` ```r (Dat_small3 <- Dat[1, 3]) ``` ``` ## # A tibble: 1 × 1 ## conti ## <fct> ## 1 Europe ``` ] .pull-right[ ```r # Leave the column index empty to return the whole first row. (Dat_small3 <- Dat[1, ]) ``` ``` ## # A tibble: 1 × 5 ## name age conti employed nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> ## 1 Agnes 34 Europe FALSE 1 ``` ```r (Dat_small3 <- Dat[, 2]) ``` ``` ## # A tibble: 5 × 1 ## age ## <dbl> ## 1 34 ## 2 22 ## 3 42 ## 4 12 ## 5 76 ``` ] --- #Transform, recode & generate variables of tibbles To transform and recode a variable, simply use `$` to specify which tibble the variable belongs to. ```r # Center age around the average age. (Dat$age_centered <- Dat$age - mean(Dat$age)) ``` ``` ## [1] -3.2 -15.2 4.8 -25.2 38.8 ``` ```r # Recode "Africa" to "Afrika!". (Dat$conti <- fct_recode(Dat$conti, "Afrika!" = "Africa")) ``` ``` ## [1] Europe Afrika! Afrika! Asia S. America ## Levels: Afrika! Asia Australia Europe N. 
America S. America ``` ```r # Divide centered age by the standard deviation of age; now it is z-standardized (mean = 0, sd = 1). (Dat$z_age <- Dat$age_centered / sd(Dat$age)) ``` ``` ## [1] -0.1305090 -0.6199178 0.1957635 -1.0277584 1.5824217 ``` A centered value is the original value minus the mean; a z-score is the centered value divided by the standard deviation. You can ask ChatGPT for more detail on centered values and z-scores. --- #Transform, recode & generate several variables of a tibble If you want to transform and recode several variables from the same tibble, get used to using `mutate()`. ```r ( Dat1 <- mutate(Dat, # Use the Dat tibble. c_nr_kids= nr_kids - mean(nr_kids), # center the "nr_kids" variable. z_nr_kids = nr_kids / sd(nr_kids), # divide nr_kids by its sd (scaled only, not centered, so not a full z-score). conti = fct_recode(conti, # Recode conti. "Europa!" = "Europe", # "Europe" to "Europa!". "Asien!" = "Asia") # "Asia" to "Asien!". ) # Don't forget to close mutate's bracket ")" ) ``` ``` ## # A tibble: 5 × 9 ## name age conti employed nr_kids age_centered z_age c_nr_kids z_nr_kids ## <chr> <dbl> <fct> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 Agnes 34 Europa! FALSE 1 -3.20 -0.131 -0.6 0.550 ## 2 Martin 22 Afrika! TRUE 0 -15.2 -0.620 -1.6 0 ## 3 Hakan 42 Afrika! TRUE 3 4.8 0.196 1.4 1.65 ## 4 Tu 12 Asien! TRUE 0 -25.2 -1.03 -1.6 0 ## 5 Thais 76 S. Amer… TRUE 4 38.8 1.58 2.4 2.20 ``` <span style="color:red;">**Attention: look how RStudio structures the brackets!**</span> --- #Conditional transform & recode To transform/recode only among certain cases, use `case_when()`. ```r ( Dat2 <- mutate( Dat, # Mutate variables contained in Dat. work=NA, work = case_when( # Start conditional recode of work, employed == FALSE ~ "not working", # 1. your condition ~ new value "not working" , employed == TRUE ~ "working" # 2. your condition ~ new value "working", ) # close case_when's bracket. ) ) ``` ``` ## # A tibble: 5 × 8 ## name age conti employed nr_kids age_centered z_age work ## <chr> <dbl> <fct> <lgl> <dbl> <dbl> <dbl> <chr> ## 1 Agnes 34 Europe FALSE 1 -3.20 -0.131 not working ## 2 Martin 22 Afrika! TRUE 0 -15.2 -0.620 working ## 3 Hakan 42 Afrika! 
TRUE 3 4.8 0.196 working ## 4 Tu 12 Asia TRUE 0 -25.2 -1.03 working ## 5 Thais 76 S. America TRUE 4 38.8 1.58 working ``` --- #Conditional transform & recode To transform/recode only among certain cases, use `case_when()`. ```r ( Dat3a <- mutate( Dat, # Mutate variables contained in Dat. conti = case_when( # Start conditional recode of conti, employed == FALSE ~ "Atlantis", # 1. complex condition ~ new value "Atlantis", age < 25 & nr_kids < 1 ~ "Antarctica" # 2. complex condition ~ new value "Antarctica", ) # close case_when's bracket. ) ) ``` ``` ## # A tibble: 5 × 7 ## name age conti employed nr_kids age_centered z_age ## <chr> <dbl> <chr> <lgl> <dbl> <dbl> <dbl> ## 1 Agnes 34 Atlantis FALSE 1 -3.20 -0.131 ## 2 Martin 22 Antarctica TRUE 0 -15.2 -0.620 ## 3 Hakan 42 <NA> TRUE 3 4.8 0.196 ## 4 Tu 12 Antarctica TRUE 0 -25.2 -1.03 ## 5 Thais 76 <NA> TRUE 4 38.8 1.58 ``` **Why does "conti" become a character variable?** **Why does "conti" have 2 missing values ("NA")?** --- #Conditional transform & recode Let us compare the following with the previous slide. ```r ( Dat3b <- mutate( Dat, # Mutate variables contained in Dat. conti = case_when( # Start conditional recode of conti, employed == FALSE ~ "Atlantis", # 1. complex condition ~ new value "Atlantis", age < 25 & nr_kids < 1 ~ "Antarctica", # 2. complex condition ~ new value "Antarctica", TRUE ~ as.character(conti)) # close case_when's bracket. ) ) ``` ``` ## # A tibble: 5 × 7 ## name age conti employed nr_kids age_centered z_age ## <chr> <dbl> <chr> <lgl> <dbl> <dbl> <dbl> ## 1 Agnes 34 Atlantis FALSE 1 -3.20 -0.131 ## 2 Martin 22 Antarctica TRUE 0 -15.2 -0.620 ## 3 Hakan 42 Afrika! TRUE 3 4.8 0.196 ## 4 Tu 12 Antarctica TRUE 0 -25.2 -1.03 ## 5 Thais 76 S. America TRUE 4 38.8 1.58 ``` --- #Conditional transform & recode Let us see what the three lines of the `case_when()` call do: 1. For all non-employed ~ recode conti to "Atlantis". 2. For all those aged < 25 who have fewer than 1 kid (i.e., no kids) ~ recode conti to "Antarctica". 
3. All remaining cases ~ use their original conti values, but transform them to a character vector. Remember, R is class sensitive! It will not combine numeric information into a character vector. Because we give case_when() "Atlantis" and "Antarctica" as new values, it assumes that we want to make conti a character vector. ```r ( Dat3b <- mutate( Dat, # Mutate variables contained in Dat. conti = case_when( # Start conditional recode of conti, employed == FALSE ~ "Atlantis", # 1. complex condition ~ new value "Atlantis", age < 25 & nr_kids < 1 ~ "Antarctica", # 2. complex condition ~ new value "Antarctica", TRUE ~ as.character(conti) ) # close case_when's bracket. ) ) ``` --- #import "pairfam" data - Import Stata, SPSS, SAS files: ["haven" package](https://haven.tidyverse.org/) - Import csv, tsv, fwf files: ["readr" package](https://readr.tidyverse.org/) - Import Excel's xlsx files: ["readxl" package](https://readxl.tidyverse.org/) ```r # Create an object pairfam and assign the # imported anchor1_50percent.dta to it, if you have # downloaded it into your "Advanced quant project" folder # library(haven) #make sure that you call out the "haven" package pairfam <- read_dta("anchor1_50percent_Eng.dta") pairfam ``` ``` ## # A tibble: 6,201 × 1,458 ## id demodiff wave sample pid parentidk1 parentidk2 parentidk3 ## <dbl> <dbl+lbl> <dbl+l> <dbl+l> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> ## 1 267206000 0 [0 non-… 1 [1 2… 1 [1 p… NA NA NA NA ## 2 112963000 0 [0 non-… 1 [1 2… 1 [1 p… NA NA NA NA ## 3 327937000 0 [0 non-… 1 [1 2… 1 [1 p… 3.28e8 NA NA NA ## 4 318656000 0 [0 non-… 1 [1 2… 1 [1 p… 3.19e8 318656101 NA NA ## 5 717889000 0 [0 non-… 1 [1 2… 1 [1 p… 7.18e8 717889101 717889101 NA ## 6 222517000 0 [0 non-… 1 [1 2… 1 [1 p… NA NA NA NA ## 7 144712000 0 [0 non-… 1 [1 2… 1 [1 p… NA NA NA NA ## 8 659357000 0 [0 non-… 1 [1 2… 1 [1 p… 6.59e8 NA NA NA ## 9 506367000 0 [0 non-… 1 [1 2… 1 [1 p… 5.06e8 506367101 NA NA ## 10 64044000 0 [0 non-… 1 [1 2… 1 [1 p… NA NA NA NA ## # ℹ 6,191 
more rows ## # ℹ 1,450 more variables: parentidk4 <dbl+lbl>, parentidk5 <dbl+lbl>, ## # parentidk6 <dbl+lbl>, parentidk7 <dbl+lbl>, parentidk8 <dbl+lbl>, ## # parentidk9 <dbl+lbl>, parentidk10 <dbl+lbl>, parentidk11 <dbl+lbl>, ## # parentidk12 <dbl+lbl>, parentidk13 <dbl+lbl>, parentidk14 <dbl+lbl>, ## # parentidk15 <dbl+lbl>, sex_gen <dbl+lbl>, psex_gen <dbl+lbl>, ## # k1sex_gen <dbl+lbl>, k2sex_gen <dbl+lbl>, k3sex_gen <dbl+lbl>, … ``` --- #First steps with pairfam .pull-left[ ```r table(pairfam$sex_gen) ``` ``` ## ## 1 2 ## 3029 3172 ``` ```r tabyl(pairfam$sex_gen) #tabyl(), a function of "janitor" package ``` ``` ## pairfam$sex_gen n percent ## 1 3029 0.4884696 ## 2 3172 0.5115304 ``` ] .pull-right[ in stata you can have <img src="https://github.com/fancycmn/Slide3/blob/main/Pic2.PNG?raw=true" width="100%" style="display: block; margin: auto;"> ] labels disappeared!!! --- #First steps with pairfam ```r #Let us get labels back class(pairfam$sex_gen) ``` ``` ## [1] "haven_labelled" "vctrs_vctr" "double" ``` ```r pairfam$sex_new <- as_factor(pairfam$sex_gen) #read the variable as a factor variable table(pairfam$sex_new) ``` ``` ## ## -10 not in demodiff -7 Incomplete data ## 0 0 ## -4 Filter error / Incorrect entry -3 Does not apply ## 0 0 ## 1 Male 2 Female ## 3029 3172 ``` ```r class(pairfam$sex_new) ``` ``` ## [1] "factor" ``` --- #First steps with pairfam ```r #compare two ways of showing tables table(pairfam$sex_new) ``` ``` ## ## -10 not in demodiff -7 Incomplete data ## 0 0 ## -4 Filter error / Incorrect entry -3 Does not apply ## 0 0 ## 1 Male 2 Female ## 3029 3172 ``` ```r tabyl(pairfam$sex_new) ``` ``` ## pairfam$sex_new n percent ## -10 not in demodiff 0 0.0000000 ## -7 Incomplete data 0 0.0000000 ## -4 Filter error / Incorrect entry 0 0.0000000 ## -3 Does not apply 0 0.0000000 ## 1 Male 3029 0.4884696 ## 2 Female 3172 0.5115304 ``` --- #Importing labelled data R imports Stata and SPSS labels, but cannot handle them. 
We need to change each labelled variable to numeric or factor from the outset of your analysis! After importing pairfam into R, sex_gen is a labelled numeric variable. `as_factor()` can be used to turn the numeric variable "sex_gen" into a categorical (factor) variable. --- #generate a two-way table (optional) Example: val1i7: the opinion about the statement “Marriage is a lifelong union that should not be broken.” (1=Disagree completely, 5=Agree completely) ```r pairfam$sex_new <- as_factor(pairfam$sex_gen) pairfam$sex_new <- fct_drop(pairfam$sex_new) pairfam$mar_att <- as_factor(pairfam$val1i7) pairfam$mar_att <- fct_drop(pairfam$mar_att) ``` .pull-left[ ```r table(pairfam$mar_att,pairfam$sex_new) ``` ``` ## ## 1 Male 2 Female ## -2 No answer 3 7 ## -1 Don't know 31 22 ## 1 Disagree completely 367 546 ## 2 351 449 ## 3 594 734 ## 4 569 583 ## 5 Agree completely 1114 831 ``` ] .pull-right[ ```r tabyl(pairfam, mar_att,sex_new) ``` ``` ## mar_att 1 Male 2 Female ## -2 No answer 3 7 ## -1 Don't know 31 22 ## 1 Disagree completely 367 546 ## 2 351 449 ## 3 594 734 ## 4 569 583 ## 5 Agree completely 1114 831 ``` ] --- #generate a two-way table (optional) ```r table1<- table(pairfam$mar_att,pairfam$sex_new) ``` .pull-left[ ```r prop.table(table1,margin=1)#row percentage ``` ``` ## ## 1 Male 2 Female ## -2 No answer 0.3000000 0.7000000 ## -1 Don't know 0.5849057 0.4150943 ## 1 Disagree completely 0.4019715 0.5980285 ## 2 0.4387500 0.5612500 ## 3 0.4472892 0.5527108 ## 4 0.4939236 0.5060764 ## 5 Agree completely 0.5727506 0.4272494 ``` ] .pull-right[ ```r prop.table(table1,margin=2)#column percentage ``` ``` ## ## 1 Male 2 Female ## -2 No answer 0.0009904259 0.0022068096 ## -1 Don't know 0.0102344008 0.0069356873 ## 1 Disagree completely 0.1211620997 0.1721311475 ## 2 0.1158798283 0.1415510719 ## 3 0.1961043249 0.2313997478 ## 4 0.1878507758 0.1837957125 ## 5 Agree completely 0.3677781446 0.2619798235 ``` ] --- #Take home 1. 
Data frames organize equally-sized vectors along their indices; Tibbles are modernized data frames. 2. Use `$` to identify a variable, and `[row, column]` to identify a cell in a table. 3. How to select information from a table - select() to select several variables - filter() to select observations based on values in certain variables - [] can also be used to select both variables and observations 4. Transform, recode, and generate some variables of a tibble 5. Import a dataset into R --- # Important code - data.frame(): organize several vectors of equal length by their index. - as_tibble(): take a data frame and make it a Tibble. - summary(): Give a summary of an object. - select(): select several variables from a data frame/Tibble. - filter(): filter cases based on values in certain variables. - mutate(): Adds new variables and preserves existing ones. Good for recoding several variables. - case_when(): Conditional recode for cases filtered in complex ways. - as_factor(): Make a labelled Stata/SPSS variable a factor. --- class: center, middle #[Exercise](https://rpubs.com/fancycmn/1215957)