micro group hw

Visuals

library(haven)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(plyr)
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------

Attaching package: 'plyr'
The following objects are masked from 'package:dplyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize
# Read the SPSS portable file once; parsing the same file twice only doubles
# the I/O and parsing time.
all_data <- read_por("C:/Users/User/Desktop/ZA4350_v2-0-0.por")
# `data` is kept as an untouched copy of the raw import for any code that
# still refers to it. NOTE(review): it is not used in the visible script and
# shadows utils::data() - consider removing it.
data <- all_data


# Give the raw survey columns descriptive snake_case names. The lookup is
# written as new_name = "OLD_NAME"; columns not listed keep their names.
all_data <- dplyr::rename(
  all_data,
  dplyr::all_of(c(
    want_to_work    = "V4",
    age             = "AGE",
    country_code    = "C_ALPHAN",
    educ_years      = "EDUCYRS",
    hompop          = "HOMPOP",
    hcomp           = "HHCYCLE",
    gender          = "SEX",
    working_hours   = "WRKHRS",
    work_type       = "WRKTYPE",
    supervizes      = "WRKSUP",
    union           = "UNION",
    occupation_full = "ISCO88",
    location        = "URBRURAL",
    partner         = "COHAB",
    marital         = "MARITAL",
    partner_emp     = "SPWRKST"
  ))
)


# Collect the distinct country codes present in the data, sorted, and show
# them so a valid code can be chosen for the per-country analysis.
wt_existing_levels <- all_data$country_code %>%
  unique() %>%
  sort()
print(wt_existing_levels)
 [1] "AU"     "BE-FLA" "BG"     "CA"     "CH"     "CY"     "CZ"     "DE-E"  
 [9] "DE-W"   "DK"     "DO"     "ES"     "FI"     "FR"     "GB-GBN" "HU"    
[17] "IE"     "IL"     "JP"     "KR"     "LV"     "MX"     "NL"     "NO"    
[25] "NZ"     "PH"     "PT"     "RU"     "SE"     "SI"     "TW"     "US"    
[33] "ZA"    
# Map each ISCO-88 occupation code to the name of its major group.
#
# The major group is the thousands digit of the four-digit code. The column is
# stored as a number, so a code with a leading zero loses that digit (e.g. 0110
# is stored as 110). Taking the first *character* would then put such a code in
# group 1, so the thousands digit is derived arithmetically instead.
# NOTE(review): codes below 1000 now yield "0", which is left unmapped like any
# other value outside 1-9 - confirm how group 0 / special codes should be
# labelled.
all_data$occupation <- plyr::mapvalues(
  as.character(floor(as.numeric(all_data$occupation_full) / 1000)),
  from = as.character(1:9),
  to = c(
    "LEGISLATORS, SENIOR OFFICIALS AND MANAGERS",
    "PROFESSIONALS",
    "TECHNICIANS AND ASSOCIATE PROFESSIONALS",
    "CLERKS",
    "SERVICE WORKERS AND SHOP AND MARKET SALES WORKERS",
    "SKILLED AGRICULTURAL AND FISHERY WORKERS",
    "CRAFT AND RELATED TRADES WORKERS",
    "PLANT AND MACHINE OPERATORS AND ASSEMBLERS",
    "ELEMENTARY OCCUPATIONS"
  )
)
# Country code to analyse; it must be one of the values of
# all_data$country_code (e.g. "JP", "DE-W"). "XX" is a placeholder that matches
# no country, so it has to be replaced before the per-country results below
# mean anything.
c_code <- "XX"

# Make an unknown code visible instead of silently carrying an empty data set
# through the rest of the script.
if (!c_code %in% all_data$country_code) {
  warning(
    "Country code '", c_code, "' is not present in the data; ",
    "country_data will be empty.",
    call. = FALSE
  )
}

# Keep respondents of the chosen country who report positive working hours.
# filter() is namespace-qualified because both dplyr and plyr are attached.
country_data <- dplyr::filter(
  all_data,
  country_code == c_code,
  working_hours > 0
)
print(paste("Initial observations in country", nrow(country_data)))
[1] "Initial observations in country 0"
# Keep only the columns used in the per-country analysis, in this order.
subset <- dplyr::select(
  country_data,
  dplyr::all_of(c(
    "working_hours", "age", "hompop", "hcomp", "gender",
    "work_type", "supervizes", "occupation_full", "occupation", "location"
  ))
)
library(haven)
library(dplyr)    
library(forcats)


# Record the distinct raw work_type values before the column is converted.
wt_existing_levels <- subset$work_type %>%
  unique() %>%
  sort()
subset[["work_type"]] <- as_factor(subset[["work_type"]])


# Income columns are country-specific: "<code>_RINC" holds the respondent's own
# income and "<code>_INC" the family income.
subset[["own_earnings"]] <- as.numeric(country_data[[paste0(c_code, "_RINC")]])
subset[["family_earnings"]] <- as.numeric(country_data[[paste0(c_code, "_INC")]])


# Code 2 is labelled "female"; every other non-missing code becomes "male".
subset[["gender"]] <- factor(ifelse(subset[["gender"]] == 2, "female", "male"))


# Keep only rows with no missing value in any column, then add two ratios:
#   earn_per_one - family earnings per household member
#   earn_ratio   - respondent's share of the family earnings
# The verbs are namespace-qualified because plyr is attached after dplyr, so a
# bare mutate() would resolve to plyr::mutate().
subset_def <- subset %>%
  dplyr::filter(complete.cases(.)) %>%
  dplyr::mutate(
    earn_per_one = family_earnings / hompop,
    earn_ratio = own_earnings / family_earnings
  )


# hcomp codes treated as "household with children", and the subset of those
# codes treated as "single adult with children".
code_with_kids <- c(2:4, 6:8, seq(10, 28, by = 2))
code_when_one_adult_with_kids <- c(2, 3, 4)

# Yes/no factor flags derived from the household composition code.
subset_def$with_kids <- factor(
  ifelse(subset_def$hcomp %in% code_with_kids, "yes", "no")
)
subset_def$one_adult_with_kids <- factor(
  ifelse(subset_def$hcomp %in% code_when_one_adult_with_kids, "yes", "no")
)
subset_def$living_alone <- factor(
  ifelse(subset_def$hcomp == 1, "yes", "no")
)


# Turn the occupation group names into a factor, then display the result.
subset_def[["occupation"]] <- as_factor(subset_def[["occupation"]])

print(subset_def)
# A tibble: 0 × 17
# ℹ 17 variables: working_hours <dbl+lbl>, age <dbl+lbl>, hompop <dbl+lbl>,
#   hcomp <dbl+lbl>, gender <fct>, work_type <fct>, supervizes <dbl+lbl>,
#   occupation_full <dbl+lbl>, occupation <fct>, location <dbl+lbl>,
#   own_earnings <dbl>, family_earnings <dbl>, earn_per_one <dbl>,
#   earn_ratio <dbl>, with_kids <fct>, one_adult_with_kids <fct>,
#   living_alone <fct>
library(ggplot2)
library(reshape2)

selected_countries <- c("JP", "FR", "FI")
# dplyr::filter() drops rows whose condition is NA. Base `[` with a logical
# index would instead insert an all-NA row for every respondent with a missing
# hcomp.
# NOTE(review): the `<= 20` cut-off presumably excludes special hcomp codes -
# confirm against the codebook.
filtered_data <- dplyr::filter(
  all_data,
  country_code %in% selected_countries,
  hcomp <= 20
)

# Count respondents per (country, household composition) cell and reshape the
# contingency table to long format; melt() names the columns Var1, Var2, value.
heatmap_data <- table(filtered_data$country_code, filtered_data$hcomp)
heatmap_data_melted <- melt(heatmap_data)

# Plot the heatmap: one tile per cell, shaded by the number of respondents.
ggplot(heatmap_data_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  labs(
    title = "Household Composition Heatmap for Selected Countries",
    x = "Country Code",
    y = "Household Composition",
    fill = "Count"
  ) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

text

# Stacked bars scaled to 1: share of each union-membership code per country.
ggplot(
  data = filter(all_data, country_code %in% c("JP", "FR", "FI")),
  mapping = aes(x = country_code, fill = factor(union))
) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    title = "Union Membership Proportion by Country",
    x = "Country",
    y = "Proportion"
  )

# Jittered points of working hours for each household composition code, with
# one panel per country.
ggplot(
  data = filter(all_data, country_code %in% c("JP", "FR", "FI")),
  mapping = aes(x = factor(hcomp), y = working_hours, color = country_code)
) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(
    title = "Working Hours by Household Composition",
    x = "Household Composition",
    y = "Working Hours"
  ) +
  facet_wrap(~ country_code) +
  theme_minimal() +
  # Small, rotated tick labels so the many composition codes stay legible.
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
Warning: Removed 1526 rows containing missing values or values outside the scale range
(`geom_point()`).

# Age distribution, one panel per country.
# `bins` is set explicitly to the value ggplot2 would otherwise pick by
# default, so the figure is unchanged but the choice is documented and the
# "Pick better value" message is no longer emitted.
ggplot(
  filter(all_data, country_code %in% c("JP", "FR", "FI")),
  aes(x = age, fill = country_code)
) +
  geom_histogram(bins = 30, alpha = 0.7) +
  labs(
    title = "Distribution of Age (Selected Countries)",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal() +
  facet_wrap(~ country_code)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of years of education, one panel per country.
# NOTE(review): the `educ_years <= 30` filter presumably drops special codes -
# confirm against the codebook.
# `bins` is set explicitly to ggplot2's default so the figure is unchanged but
# the "Pick better value" message is no longer emitted.
ggplot(
  filter(all_data, country_code %in% c("JP", "FR", "FI"), educ_years <= 30),
  aes(x = educ_years, fill = country_code)
) +
  geom_histogram(bins = 30, alpha = 0.7) +
  labs(
    title = "Distribution of Education Years (Selected Countries)",
    x = "Years of Education",
    y = "Count"
  ) +
  theme_minimal() +
  facet_wrap(~ country_code)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(dplyr)
library(ggplot2)


# Restrict the data to the three countries being compared.
filtered_data <- all_data %>%
  dplyr::filter(country_code %in% c("JP", "FR", "FI"))


# Distribution of weekly working hours, one panel per country.
# `bins` is set explicitly to ggplot2's default so the figure is unchanged but
# the "Pick better value" message is no longer emitted.
ggplot(filtered_data, aes(x = working_hours, fill = country_code)) +
  geom_histogram(bins = 30, alpha = 0.6) +
  facet_wrap(~ country_code) +
  labs(title = "Working Hours by Country", x = "Working Hours", y = "Count") +
  theme_minimal()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1526 rows containing non-finite outside the scale range
(`stat_bin()`).

gender

# Discard respondents with no recorded gender, then turn the numeric codes
# into a labelled factor (1 = "Male", 2 = "Female").
filtered_data <- filter(filtered_data, !is.na(gender))
filtered_data$gender <- factor(
  filtered_data$gender,
  levels = c(1, 2),
  labels = c("Male", "Female")
)


# Number of respondents of each gender, one panel per country.
ggplot(data = filtered_data, mapping = aes(x = gender, fill = gender)) +
  geom_bar(color = "black", alpha = 0.8) +
  # Fixed colour per factor level.
  scale_fill_manual(values = c(Male = "#FF9999", Female = "#9999FF")) +
  labs(title = "Gender Distribution by Country", x = "Gender", y = "Count") +
  facet_wrap(~ country_code) +
  theme_minimal()

gender and working hours

# Build the data for the gender / working-hours comparison in one pass:
# restrict to the three countries, keep only the needed columns, drop rows
# missing either value, and label the gender codes (1 = "Male", 2 = "Female").
filtered_data <- all_data %>%
  filter(country_code %in% c("JP", "FR", "FI")) %>%
  select(country_code, gender, working_hours) %>%
  filter(!is.na(gender) & !is.na(working_hours)) %>%
  mutate(
    gender = factor(gender, levels = c(1, 2), labels = c("Male", "Female"))
  )


library(ggplot2)
# Box plots of working hours by gender, one panel per country.
ggplot(
  data = filtered_data,
  mapping = aes(x = gender, y = working_hours, fill = gender)
) +
  geom_boxplot(color = "black", alpha = 0.7) +
  # Colours are assigned to the gender levels in order.
  scale_fill_manual(values = c("lightblue", "pink")) +
  labs(
    title = "Gender Distribution and Working Hours by Country",
    x = "Gender",
    y = "Working Hours"
  ) +
  facet_wrap(~ country_code) +
  theme_minimal()

library(ggplot2)

# Keep only respondents from Japan, France, and Finland
selected_countries <- c("JP", "FR", "FI")
filtered_data <- dplyr::filter(all_data, country_code %in% selected_countries)

# Side-by-side bars: respondents per household composition code and country.
ggplot(
  data = filtered_data,
  mapping = aes(x = factor(hcomp), fill = country_code)
) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Distribution of Household Composition by Country",
    x = "Household Composition",
    y = "Count",
    fill = "Country Code"
  )

# Select the country code and the three country-specific income columns
income_columns <- c("FI_RINC", "FR_RINC", "JP_RINC")
income_data <- filtered_data[, c("country_code", income_columns)]

# The income columns carry different label attributes, which melt() can only
# reconcile by dropping them with a warning. Converting to plain numeric first
# makes that explicit and keeps the reshape quiet.
income_data[income_columns] <- lapply(income_data[income_columns], as.numeric)

# Melt the data for ggplot: one row per (respondent, income column); the
# `country` column holds the name of the income column the value came from.
library(reshape2)
income_data_melted <- melt(
  income_data,
  id.vars = "country_code",
  variable.name = "country",
  value.name = "income"
)
Warning: attributes are not identical across measure variables; they will be
dropped
# Box plot of the melted income values, one box per income column
ggplot(
  data = income_data_melted,
  mapping = aes(x = country, y = income, fill = country)
) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10)) +
  labs(
    title = "Distribution of Income by Country",
    x = "Country",
    y = "Income"
  )
Warning: Removed 8592 rows containing non-finite outside the scale range
(`stat_boxplot()`).

updated

# Filter data for the selected countries
selected_countries <- c("FI", "FR", "JP")
filtered_data <- all_data[all_data$country_code %in% selected_countries, ]

# Remove rows where income for Finland (FI_RINC) is greater than 5000.
# The explicit !is.na() guard matters: without it a Finnish respondent with a
# missing FI_RINC gives an NA condition, and indexing with NA replaces the row
# by an all-NA row instead of keeping it.
filtered_data <- dplyr::filter(
  filtered_data,
  !(country_code == "FI" & !is.na(FI_RINC) & FI_RINC > 5000)
)

# Select the country code and the three country-specific income columns
income_columns <- c("FI_RINC", "FR_RINC", "JP_RINC")
income_data <- filtered_data[, c("country_code", income_columns)]

# The income columns carry different label attributes, which melt() can only
# reconcile by dropping them with a warning. Converting to plain numeric first
# makes that explicit and keeps the reshape quiet.
income_data[income_columns] <- lapply(income_data[income_columns], as.numeric)

# Melt the data for ggplot: one row per (respondent, income column); the
# `country` column holds the name of the income column the value came from.
library(reshape2)
income_data_melted <- melt(
  income_data,
  id.vars = "country_code",
  variable.name = "country",
  value.name = "income"
)
Warning: attributes are not identical across measure variables; they will be
dropped
# Box plot of the melted income values after the Finnish income filter, one
# box per income column
ggplot(
  data = income_data_melted,
  mapping = aes(x = country, y = income, fill = country)
) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10)) +
  labs(
    title = "Distribution of Income by Country",
    x = "Country",
    y = "Income"
  )
Warning: Removed 8522 rows containing non-finite outside the scale range
(`stat_boxplot()`).

income for countries

# Scatter of FI_RINC against working hours, coloured by country code.
ggplot(
  data = filtered_data,
  mapping = aes(x = working_hours, y = FI_RINC, color = country_code)
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Income vs. Working Hours (Finland)",
    x = "Working Hours",
    y = "Income"
  )
Warning: Removed 3183 rows containing missing values or values outside the scale range
(`geom_point()`).

# Scatter of FR_RINC against working hours, coloured by country code.
ggplot(
  data = filtered_data,
  mapping = aes(x = working_hours, y = FR_RINC)
) +
  geom_point(mapping = aes(color = country_code)) +
  theme_minimal() +
  labs(
    title = "Income vs. Working Hours (France)",
    x = "Working Hours",
    y = "Income"
  )
Warning: Removed 2797 rows containing missing values or values outside the scale range
(`geom_point()`).

# Scatter of JP_RINC against working hours, coloured by country code.
ggplot(
  data = filtered_data,
  mapping = aes(x = working_hours, y = JP_RINC, color = country_code)
) +
  theme_minimal() +
  labs(
    title = "Income vs. Working Hours (Japan)",
    x = "Working Hours",
    y = "Income"
  ) +
  geom_point()
Warning: Removed 3452 rows containing missing values or values outside the scale range
(`geom_point()`).