library(dplyr)
library(psych)
library(readxl)
library(writexl)
library(kableExtra)
# Econometrics
library(tidyverse)
library(plm) # panel models
library(sandwich) #covariance matrixes
library(lmtest) # tests
library(xtable)# latex tables
library(stargazer) # latex regression tables
library(ggpubr) # correlation test
# Load the time-use dataset from CSV into a tibble. `show_col_types = FALSE`
# silences readr's column-specification message.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact location exists.
time_use_dat <- read_csv(
  file = "C:/Users/Popov/Documents/Research/Hosni/Family_economics/Project/Time_use_R_Py.csv",
  show_col_types = FALSE
)
# Auto-print a preview of the loaded tibble
time_use_dat
## # A tibble: 169,958 × 49
## sample pernum ident hhid pid serial year age civstat cohab educ
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 1930 1 10101 1 1 1 1931 42 Married No College Gra…
## 2 1930 1 10102 1 1 2 1931 42 Married No College Gra…
## 3 1930 1 10103 1 1 3 1931 42 Married No College Gra…
## 4 1930 1 10104 1 1 4 1931 42 Married No College Gra…
## 5 1930 1 10105 1 1 5 1931 42 Married No College Gra…
## 6 1930 1 10106 1 1 6 1931 42 Married No College Gra…
## 7 1930 1 10107 1 1 7 1931 42 Married No College Gra…
## 8 1930 1 20101 2 1 8 1931 47 Married No College Gra…
## 9 1930 1 20102 2 1 9 1931 47 Married No College Gra…
## 10 1930 1 20103 2 1 10 1931 47 Married No College Gra…
## # ℹ 169,948 more rows
## # ℹ 38 more variables: recwght <dbl>, ethnic <chr>, hisp <chr>, ageyngst <dbl>,
## # ownhome <chr>, state <chr>, regione <chr>, hhtype <chr>, famstat <chr>,
## # nadult <dbl>, under5 <dbl>, under18 <dbl>, empstat <chr>, incomeqt <chr>,
## # empsp <chr>, wagelm <dbl>, wkhrs <chr>, student <chr>, occup <chr>,
## # homemakr <chr>, retired <chr>, nwork <dbl>, nchild <dbl>, agekid <chr>,
## # hhldsize <dbl>, act_chcare <dbl>, act_civic <dbl>, act_educa <dbl>, …
# Descriptive statistics for every column via psych::describe(), rounded to
# 3 decimal places.
Desc_stat <- time_use_dat %>%
  psych::describe() %>%
  round(3)
# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a GUI data viewer and may fail in non-interactive runs (Rscript, knitr,
# headless servers).
# Original author's note: the figures coincide with the Stata analysis.
if (interactive()) {
  View(Desc_stat)
}
library(ggcorrplot)
# Keep just the time-use activity variables, i.e. the columns whose names
# begin with "act_"
act_columns <- select(time_use_dat, starts_with("act_"))
# Auto-print a preview of the selection
act_columns
## # A tibble: 169,958 × 12
## act_chcare act_civic act_educa act_inhome act_media act_missing act_outhome
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 45 0 0 5 90 0 0
## 2 115 0 0 105 110 0 0
## 3 30 0 0 75 120 0 145
## 4 95 80 0 0 335 0 0
## 5 60 0 0 40 115 0 0
## 6 55 15 0 50 115 0 0
## 7 65 0 0 5 90 0 0
## 8 90 0 0 0 225 0 0
## 9 10 0 0 0 230 0 0
## 10 10 0 0 120 125 0 170
## # ℹ 169,948 more rows
## # ℹ 5 more variables: act_pcare <dbl>, act_physical <dbl>, act_travel <dbl>,
## # act_undom <dbl>, act_work <dbl>
# Correlation matrix of the activity columns, plotted with ggcorrplot
time_use_corr_plot <- act_columns %>%
  cor() %>%
  ggcorrplot()
# Write the figure to a PDF file
ggsave(
  filename = "time_use_corr_plot.pdf",
  plot = time_use_corr_plot,
  width = 8,
  height = 6
)
# Display the figure
time_use_corr_plot
### Other (non-activity) variables in the time-use dataset
# Keep numeric columns only. is.numeric() is already TRUE for integer vectors,
# so one predicate suffices; select(where(...)) replaces the superseded
# select_if().
int_numeric_columns <- time_use_dat %>% select(where(is.numeric))
# Exclude the activity columns and the identifier columns
other_columns <- int_numeric_columns %>%
  select(
    -starts_with("act_"),
    -sample, -pernum, -ident, -hhid, -pid, -serial, -year
  )
# Total number of missing cells across the remaining columns
sum(is.na(other_columns)) # a lot: more than 1/4 of all cells in the printed run
## [1] 536577
# Per-column count of missing values
missing_values <- other_columns %>%
  is.na() %>%
  colSums()
# Print the named vector (column name -> number of NAs)
missing_values
## age recwght ageyngst nadult under5 under18 wagelm nwork
## 833 525 81434 632 9951 714 85573 18045
## nchild hhldsize sex
## 169433 169433 4
# Drop the three columns named below (excluded for their many missing values),
# then keep only the rows that are complete in every remaining column.
# NOTE(review): in the printed NA counts, wagelm has more missing values than
# ageyngst but is kept, so drop_na() discards a large share of the rows --
# confirm this is intended.
clean_data <- other_columns %>%
  select(-any_of(c("nchild", "hhldsize", "ageyngst"))) %>%
  drop_na()
# Correlation matrix of the cleaned numeric variables, plotted with ggcorrplot
other_corr_plot <- clean_data %>%
  cor() %>%
  ggcorrplot()
# Display the figure
other_corr_plot
# Write the figure to a PDF file
ggsave(
  filename = "others_corr_plot.pdf",
  plot = other_corr_plot,
  width = 8,
  height = 6
)