Family_Econ_R_data

Libraries

library(dplyr) 
library(psych)
library(readxl)
library(writexl)
library(kableExtra)

# Econometrics
library(tidyverse)
library(plm) # panel data models
library(sandwich) # robust covariance matrix estimators
library(lmtest) # tests for linear regression models
library(xtable) # LaTeX tables
library(stargazer) # LaTeX regression tables
library(ggpubr) # correlation test

Loading the data

# Read the time-use extract from its local CSV file.
# show_col_types = FALSE silences readr's column-specification message.
time_use_dat <- read_csv(
  file = "C:/Users/Popov/Documents/Research/Hosni/Family_economics/Project/Time_use_R_Py.csv",
  show_col_types = FALSE
)

# Auto-print a preview of the imported tibble
time_use_dat

## # A tibble: 169,958 × 49
##    sample pernum ident  hhid   pid serial  year   age civstat cohab educ        
##     <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <chr>   <chr> <chr>       
##  1   1930      1 10101     1     1      1  1931    42 Married No    College Gra…
##  2   1930      1 10102     1     1      2  1931    42 Married No    College Gra…
##  3   1930      1 10103     1     1      3  1931    42 Married No    College Gra…
##  4   1930      1 10104     1     1      4  1931    42 Married No    College Gra…
##  5   1930      1 10105     1     1      5  1931    42 Married No    College Gra…
##  6   1930      1 10106     1     1      6  1931    42 Married No    College Gra…
##  7   1930      1 10107     1     1      7  1931    42 Married No    College Gra…
##  8   1930      1 20101     2     1      8  1931    47 Married No    College Gra…
##  9   1930      1 20102     2     1      9  1931    47 Married No    College Gra…
## 10   1930      1 20103     2     1     10  1931    47 Married No    College Gra…
## # ℹ 169,948 more rows
## # ℹ 38 more variables: recwght <dbl>, ethnic <chr>, hisp <chr>, ageyngst <dbl>,
## #   ownhome <chr>, state <chr>, regione <chr>, hhtype <chr>, famstat <chr>,
## #   nadult <dbl>, under5 <dbl>, under18 <dbl>, empstat <chr>, incomeqt <chr>,
## #   empsp <chr>, wagelm <dbl>, wkhrs <chr>, student <chr>, occup <chr>,
## #   homemakr <chr>, retired <chr>, nwork <dbl>, nchild <dbl>, agekid <chr>,
## #   hhldsize <dbl>, act_chcare <dbl>, act_civic <dbl>, act_educa <dbl>, …

Descriptive statistics

# Descriptive statistics for every column of the time-use data,
# computed with psych::describe() and rounded to 3 decimal places.
Desc_stat <- time_use_dat %>%
  psych::describe() %>%
  round(3)

# View() opens the interactive data viewer, so only call it in an interactive
# session; when the document is knitted or run via Rscript there may be no
# viewer available and the call can fail.
if (interactive()) {
  View(Desc_stat) # coincides with stata analysis
}

Correlation matrix

for time use

library(ggcorrplot)

# Time-use activity variables: every column whose name begins with "act_"
act_columns <- select(time_use_dat, starts_with("act_"))
act_columns

## # A tibble: 169,958 × 12
##    act_chcare act_civic act_educa act_inhome act_media act_missing act_outhome
##         <dbl>     <dbl>     <dbl>      <dbl>     <dbl>       <dbl>       <dbl>
##  1         45         0         0          5        90           0           0
##  2        115         0         0        105       110           0           0
##  3         30         0         0         75       120           0         145
##  4         95        80         0          0       335           0           0
##  5         60         0         0         40       115           0           0
##  6         55        15         0         50       115           0           0
##  7         65         0         0          5        90           0           0
##  8         90         0         0          0       225           0           0
##  9         10         0         0          0       230           0           0
## 10         10         0         0        120       125           0         170
## # ℹ 169,948 more rows
## # ℹ 5 more variables: act_pcare <dbl>, act_physical <dbl>, act_travel <dbl>,
## #   act_undom <dbl>, act_work <dbl>

# Correlation matrix of the activity columns, drawn as a ggcorrplot heatmap
time_use_corr_plot <- act_columns %>%
  cor() %>%
  ggcorrplot()

# Write the heatmap to a PDF file in the working directory
ggsave(
  filename = "time_use_corr_plot.pdf",
  plot = time_use_corr_plot,
  width = 8,
  height = 6
)

# Display the heatmap
time_use_corr_plot

### for other variables in Time_use dataset

# Select only numeric columns.
# is.numeric() is already TRUE for integer columns, so no separate
# is.integer() check is needed; select(where()) replaces the superseded
# select_if().
int_numeric_columns <- time_use_dat %>%
  select(where(is.numeric))

# Exclude the time-use activity columns and the identifier columns
other_columns <- int_numeric_columns %>%
  select(
    -starts_with("act_"),
    -sample, -pernum, -ident, -hhid, -pid, -serial, -year
  )

# Total number of missing cells across the remaining columns
sum(is.na(other_columns)) # a lot: roughly 29% of all cells in the output below

## [1] 536577

# Count the missing values in each column (named vector, one entry per column)
missing_values <- other_columns %>%
  is.na() %>%
  colSums()

# Show the per-column counts
missing_values

##      age  recwght ageyngst   nadult   under5  under18   wagelm    nwork 
##      833      525    81434      632     9951      714    85573    18045 
##   nchild hhldsize      sex 
##   169433   169433        4

# Drop the three columns with the most missing values, then keep only the
# rows that are complete in every remaining column
clean_data <- other_columns %>%
  select(-any_of(c("nchild", "hhldsize", "ageyngst"))) %>%
  drop_na()

# Correlation matrix of the cleaned variables, drawn as a ggcorrplot heatmap
other_corr_plot <- clean_data %>%
  cor() %>%
  ggcorrplot()
other_corr_plot

# Write the heatmap to a PDF file in the working directory
ggsave(
  filename = "others_corr_plot.pdf",
  plot = other_corr_plot,
  width = 8,
  height = 6
)

Family_Econ_R_data_description

Nikolai Popov

2024-10-09

Libraries

Loading the data

Descriptive statistics

Correlation matrix

for time use