Hyejin Roh (h12428201)


1. Find data


2. Import data

# Load Household.csv (comma-separated, first row = column names) into a data frame.
mydata <- read.table(file = "./Household.csv", sep = ",", header = TRUE)

3. Display data

# Preview the first six rows of the imported data.
head(mydata, n = 6L)
##   CapitalRegion Assets Liabilities NetAssets
## 1            G1      1        5300     -5299
## 2            G1     10           0        10
## 3            G1     10           0        10
## 4            G1     10           0        10
## 5            G1     10           0        10
## 6            G1    100           0       100

4. Explain data


5. Data source


6-1. Data manipulation

library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Fix the RNG seed so the two random draws below are reproducible.
set.seed(2024)

# Recode the region codes G1/G2 as a factor labelled Capital/NonCapital.
mydata$CapitalRegion <- factor(
  mydata$CapitalRegion,
  levels = c("G1", "G2"),
  labels = c("Capital", "NonCapital")
)

# Capital households with strictly positive net assets (log() is taken
# later), then 30 of them drawn at random without replacement.
Capital <- dplyr::filter(mydata, CapitalRegion == "Capital" & NetAssets > 0)
Capital <- Capital[sample(nrow(Capital), size = 30), ]

# Same selection for NonCapital; it is drawn second so the RNG stream is
# consumed in the same order on every run.
NonCapital <- dplyr::filter(mydata, CapitalRegion == "NonCapital" & NetAssets > 0)
NonCapital <- NonCapital[sample(nrow(NonCapital), size = 30), ]

# Stack both samples, keep only the region and net-asset columns, and add
# the natural log of net assets as the analysis variable.
sample.mydata <- rbind(Capital, NonCapital)[, c("CapitalRegion", "NetAssets")]
sample.mydata$LogNetAssets <- log(sample.mydata$NetAssets)

sample.mydata
##       CapitalRegion NetAssets LogNetAssets
## 5698        Capital     38300    10.553205
## 4645        Capital     54600    10.907789
## 3629        Capital     42552    10.658482
## 4796        Capital     66500    11.104957
## 5375        Capital     74396    11.217157
## 5009        Capital      7066     8.863050
## 2669        Capital     29400    10.288750
## 3488        Capital     40168    10.600826
## 105         Capital    102300    11.535665
## 1762        Capital     20220     9.914427
## 5119        Capital     55126    10.917377
## 5149        Capital     76740    11.248178
## 1035        Capital    150400    11.921054
## 4880        Capital    688300    13.441980
## 398         Capital    113768    11.641917
## 2978        Capital     18575     9.829572
## 3098        Capital      2420     7.791523
## 3634        Capital    426000    12.962195
## 3104        Capital     24940    10.124228
## 2773        Capital     30338    10.320156
## 4299        Capital     33000    10.404263
## 35          Capital     29740    10.300248
## 1385        Capital     16810     9.729729
## 4334        Capital      5500     8.612503
## 2641        Capital     29000    10.275051
## 3356        Capital     31266    10.350287
## 3288        Capital     36740    10.511621
## 3668        Capital      4313     8.369389
## 3548        Capital     25990    10.165467
## 1451        Capital      1800     7.495542
## 9658     NonCapital     60980    11.018301
## 8052     NonCapital      4565     8.426174
## 10516    NonCapital      7217     8.884195
## 8119     NonCapital     25400    10.142504
## 9303     NonCapital     57450    10.958670
## 6649     NonCapital     35650    10.481504
## 10858    NonCapital     64994    11.082050
## 1916     NonCapital     14913     9.609989
## 10116    NonCapital      6653     8.802823
## 9863     NonCapital      6350     8.756210
## 4020     NonCapital     22630    10.027032
## 212      NonCapital     42880    10.666161
## 1305     NonCapital    128358    11.762579
## 1849     NonCapital    138935    11.841761
## 4913     NonCapital      2656     7.884577
## 11574    NonCapital       900     6.802395
## 8026     NonCapital     32610    10.392374
## 9137     NonCapital       556     6.320768
## 3715     NonCapital     19519     9.879144
## 4249     NonCapital     23410    10.060919
## 8897     NonCapital      4560     8.425078
## 3860     NonCapital       145     4.976734
## 2441     NonCapital     12268     9.414750
## 9935     NonCapital     64336    11.071875
## 8656     NonCapital      5100     8.536996
## 4085     NonCapital     22910    10.039329
## 2234     NonCapital      5056     8.528331
## 2353     NonCapital     16200     9.692767
## 10867    NonCapital     59650    10.996249
## 1155     NonCapital    110864    11.616060

6-2. Data manipulation - Descriptive statistics

library(psych)

# Descriptive statistics of log net assets, reported separately per region.
describeBy(
  x = sample.mydata$LogNetAssets,
  group = sample.mydata$CapitalRegion
)
## 
##  Descriptive statistics by group 
## group: Capital
##    vars  n mean   sd median trimmed  mad min   max range  skew kurtosis   se
## X1    1 30 10.4 1.31  10.38   10.42 0.81 7.5 13.44  5.95 -0.11      0.3 0.24
## ------------------------------------------------------------ 
## group: NonCapital
##    vars  n mean   sd median trimmed  mad  min   max range  skew kurtosis  se
## X1    1 30 9.57 1.63   9.95    9.74 1.62 4.98 11.84  6.87 -0.88      0.4 0.3

7-1. Statistical hypothesis test

Research Question
  • Is there a difference in the average net assets between households in the Capital Region and those in the Non Capital Region? (The comparison is made on the log scale, i.e. on log(Net Assets).)

Research Hypothesis
  • \(H_0: \mu_{capital} = \mu_{noncapital}\)
  • \(H_1: \mu_{capital} \neq \mu_{noncapital}\)

Assumptions
  • Variable is numeric: log(Net Assets) is numeric
  • Normality: check by histograms & Shapiro-Wilk test
  • For independent samples t-test:
    • Independence: households in the capital/non-capital region are independent
    • Homogeneity of variance: check by Levene’s test
library(ggplot2)
## 
## 다음의 패키지를 부착합니다: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Histogram of log net assets (bin width 0.5), one stacked panel per region.
ggplot(data = sample.mydata, mapping = aes(x = LogNetAssets)) +
  geom_histogram(colour = "gray", binwidth = 0.5) +
  facet_wrap(facets = vars(CapitalRegion), ncol = 1) +
  labs(y = "Amount")

  • check whether the data is normally distributed (based on the histograms)
  • assume normally distributed
library(rstatix)
## 
## 다음의 패키지를 부착합니다: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
# Shapiro-Wilk normality test on log net assets, run once per region.
shapiro_test(
  group_by(sample.mydata, CapitalRegion),
  LogNetAssets
)
## # A tibble: 2 × 4
##   CapitalRegion variable     statistic      p
##   <fct>         <chr>            <dbl>  <dbl>
## 1 Capital       LogNetAssets     0.954 0.215 
## 2 NonCapital    LogNetAssets     0.932 0.0564
Shapiro Test
  • whether the data is normally distributed

  • \(H_0\): log(Net Assets) is normally distributed

  • \(H_1\): log(Net Assets) is NOT normally distributed

  • Capital Region:

    • p-value = 0.215 > 0.05 \(\rightarrow\) can’t reject \(H_0\)
    • no evidence against normality, so the data is treated as normally distributed

  • Non Capital Region:

    • p-value = 0.056 > 0.05 \(\rightarrow\) can’t reject \(H_0\)
    • no evidence against normality (although the p-value is close to 0.05), so the data is treated as normally distributed
       
library(car)
## 필요한 패키지를 로딩중입니다: carData
## 
## 다음의 패키지를 부착합니다: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
# Levene's test for equality of variances of log net assets across regions.
leveneTest(
  y = sample.mydata$LogNetAssets,
  group = sample.mydata$CapitalRegion
)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  1.6048 0.2103
##       58
Levene’s Test
  • whether the variance is the same
  • \(H_0: \sigma^2_{capital} = \sigma^2_{noncapital}\)
  • \(H_1: \sigma^2_{capital} \neq \sigma^2_{noncapital}\)
  • p-value = 0.2103 > 0.05 \(\rightarrow\) can’t reject \(H_0\)
  • assume the variance is the same

7-2. Statistical hypothesis test - parametric test (Independent t-test)

# Two-sided independent-samples t-test on log net assets by region, using
# the pooled-variance (var.equal = TRUE) form.
t.test(
  sample.mydata$LogNetAssets ~ sample.mydata$CapitalRegion,
  alternative = "two.sided",
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  sample.mydata$LogNetAssets by sample.mydata$CapitalRegion
## t = 2.1785, df = 58, p-value = 0.03344
## alternative hypothesis: true difference in means between group Capital and group NonCapital is not equal to 0
## 95 percent confidence interval:
##  0.06752773 1.59635847
## sample estimates:
##    mean in group Capital mean in group NonCapital 
##                10.401886                 9.569943
library(effectsize)
## 
## 다음의 패키지를 부착합니다: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:psych':
## 
##     phi
# Cohen's d for the difference in mean log net assets between regions.
# The pooled SD is used so the effect size rests on the same equal-variance
# assumption as the t-test it accompanies (var.equal = TRUE). Both groups
# have n = 30, so the point estimate is the same as with the un-pooled SD.
effectsize::cohens_d(sample.mydata$LogNetAssets ~ sample.mydata$CapitalRegion,
                     pooled_sd = TRUE)
## Cohen's d |       95% CI
## ------------------------
## 0.56      | [0.04, 1.08]
## 
## - Estimated using pooled SD.
# Label the magnitude of d = 0.56 (value copied from the Cohen's d output)
# using the "sawilowsky2009" rule set.
interpret_cohens_d(d = 0.56, rules = "sawilowsky2009")
## [1] "medium"
## (Rules: sawilowsky2009)

7-3. Statistical hypothesis test - non-parametric test (Wilcoxon Rank Sum Test)

# Two-sided Wilcoxon rank-sum test on log net assets by region, with the
# exact p-value computation and the continuity correction both switched off.
wilcox.test(
  sample.mydata$LogNetAssets ~ sample.mydata$CapitalRegion,
  alternative = "two.sided",
  paired = FALSE,
  exact = FALSE,
  correct = FALSE
)
## 
##  Wilcoxon rank sum test
## 
## data:  sample.mydata$LogNetAssets by sample.mydata$CapitalRegion
## W = 577, p-value = 0.06043
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)

# Effect size for the Wilcoxon rank-sum test: the same test call is re-run
# and handed to effectsize(), which reports the rank-biserial correlation.
effectsize(
  wilcox.test(
    sample.mydata$LogNetAssets ~ sample.mydata$CapitalRegion,
    alternative = "two.sided",
    paired = FALSE,
    exact = FALSE,
    correct = FALSE
  )
)
## r (rank biserial) |        95% CI
## ---------------------------------
## 0.28              | [ 0.00, 0.53]
# Label the magnitude of r = 0.28 (value copied from the rank-biserial
# output) using the function's default rule set.
interpret_rank_biserial(r = 0.28)
## [1] "medium"
## (Rules: funder2019)

7-4. Conclusion