library(readr)
library(dplyr)
Data is imported
# Read ipums.csv from the course web server; no column types are supplied,
# so readr infers them on import.
final_df <- read_csv(
  file = "http://www.personal.psu.edu/dlp/w540/ipums.csv"
)
## Parsed with column specification:
## cols(
## AGE = col_integer(),
## GENDER = col_integer(),
## CTZUSIN = col_integer(),
## SALARY = col_integer()
## )
Filter for age group older than 34 and younger than 66
# Keep the four analysis columns and restrict the sample to respondents
# strictly older than 34 and strictly younger than 66.
final_df_filtered <- final_df %>%
  select(AGE, GENDER, CTZUSIN, SALARY) %>%
  filter(AGE > 34, AGE < 66)
Question #1 Does SALARY differ by GENDER ?
The null hypothesis is that mean SALARY does not differ by GENDER. The alternative hypothesis is that mean SALARY differs between the two GENDER groups.
The results indicate a statistically significant difference (t = 23.393, p-value < 2.2e-16). The sample means are 1497152.9 for GENDER = 1 and 916008.2 for GENDER = 2, a difference of about 581,145.
# Welch two-sample t-test comparing SALARY for GENDER == 1 against GENDER == 2.
a <- t.test(
  x = final_df_filtered$SALARY[final_df_filtered$GENDER == 1],
  y = final_df_filtered$SALARY[final_df_filtered$GENDER == 2]
)
a
##
## Welch Two Sample t-test
##
## data: final_df_filtered$SALARY[final_df_filtered$GENDER == 1] and final_df_filtered$SALARY[final_df_filtered$GENDER == 2]
## t = 23.393, df = 51010, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 532452.3 629837.2
## sample estimates:
## mean of x mean of y
## 1497152.9 916008.2
Question #2 Does SALARY differ by CTZUSIN?
The null hypothesis is that there is no difference in mean SALARY by CTZUSIN. The alternative hypothesis is that mean SALARY differs by citizenship status.
The test shows a statistically significant difference (t = -7.296, p-value = 3.27e-13): mean SALARY is 905567.9 for CTZUSIN = 0 and 1179975.8 for CTZUSIN = 1.
# Welch two-sample t-test comparing SALARY for CTZUSIN == 0 against
# CTZUSIN == 1.
b <- t.test(
  x = final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 0],
  y = final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 1]
)
b
##
## Welch Two Sample t-test
##
## data: final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 0] and final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 1]
## t = -7.296, df = 7461.4, p-value = 3.27e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -348135.7 -200680.3
## sample estimates:
## mean of x mean of y
## 905567.9 1179975.8
Question #3 Is there a relationship between SALARY (dependent variable) and AGE, GENDER, CTZUSIN (independent variables)?
The null hypothesis is that there is no correlation between SALARY and the other three variables. The alternative hypothesis is that at least one of the variables is correlated with SALARY. The strongest correlation with SALARY is with AGE (r = 0.187) and the weakest is with CTZUSIN (r = 0.025); GENDER is weakly negatively correlated with SALARY (r = -0.093).
# Pairwise Pearson correlations among AGE, GENDER, CTZUSIN and SALARY.
cor(x = final_df_filtered, method = "pearson")
## AGE GENDER CTZUSIN SALARY
## AGE 1.00000000 0.07170198 0.22245527 0.18672466
## GENDER 0.07170198 1.00000000 -0.04237633 -0.09305419
## CTZUSIN 0.22245527 -0.04237633 1.00000000 0.02515491
## SALARY 0.18672466 -0.09305419 0.02515491 1.00000000
Question #4 Does SALARY of males (Gender = 2) differ by their citizenship status, CTZUSIN? The null hypothesis is that mean SALARY does not differ by citizenship status. The alternative hypothesis is that mean SALARY differs by citizenship status. There is a statistically significant difference (t = -10.038, p-value < 2.2e-16): mean SALARY is 577417.0 for CTZUSIN = 0 and 952405.8 for CTZUSIN = 1.
# Subset to GENDER == 2, then compare SALARY between CTZUSIN == 0 and
# CTZUSIN == 1 with a Welch two-sample t-test.
# NOTE(review): the write-up treats GENDER == 2 as male; confirm against the
# data set's codebook (the IPUMS SEX variable codes 1 = male, 2 = female).
final_males <- final_df_filtered %>%
  filter(GENDER == 2) %>%
  select(AGE, GENDER, CTZUSIN, SALARY)
# NOTE(review): `c` shares its name with base::c(); calls to c() still resolve
# to the function, but a more descriptive name would be clearer.
c <- t.test(
  x = final_males$SALARY[final_males$CTZUSIN == 0],
  y = final_males$SALARY[final_males$CTZUSIN == 1]
)
c
##
## Welch Two Sample t-test
##
## data: final_males$SALARY[final_males$CTZUSIN == 0] and final_males$SALARY[final_males$CTZUSIN == 1]
## t = -10.038, df = 5395, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -448223.9 -301753.8
## sample estimates:
## mean of x mean of y
## 577417.0 952405.8
Regression model and diagnostic plots for question #4
# Regress SALARY on CTZUSIN within the final_males subset and report the fit.
mylm <- lm(formula = SALARY ~ CTZUSIN, data = final_males)
summary(mylm)
##
## Call:
## lm(formula = SALARY ~ CTZUSIN, data = final_males)
##
## Residuals:
## Min 1Q Median 3Q Max
## -952406 -877406 -843406 -802406 9422581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 577417 43801 13.183 < 2e-16 ***
## CTZUSIN 374989 46095 8.135 4.23e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2727000 on 39941 degrees of freedom
## Multiple R-squared: 0.001654, Adjusted R-squared: 0.001629
## F-statistic: 66.18 on 1 and 39941 DF, p-value: 4.235e-16
# Draw the default diagnostic plots for the fitted linear model.
plot(x = mylm)