library(readr)
library(dplyr)

Data is imported

# Load the IPUMS extract directly from the course URL. No column types are
# supplied, so readr guesses them and reports the guess in the message below.
final_df <- read_csv(file = "http://www.personal.psu.edu/dlp/w540/ipums.csv")
## Parsed with column specification:
## cols(
##   AGE = col_integer(),
##   GENDER = col_integer(),
##   CTZUSIN = col_integer(),
##   SALARY = col_integer()
## )

Filter for age group older than 34 and younger than 66

# Keep the four analysis columns, then keep only rows whose AGE is strictly
# greater than 34 and strictly less than 66. Comma-separated conditions in
# filter() are combined with a logical AND.
# NOTE(review): the regression output further down reports a maximum residual
# of 9422581 on an intercept of 577417, i.e. a SALARY of 9,999,998. That looks
# like a missing-data sentinel rather than a real salary -- confirm against
# the IPUMS codebook before trusting the means.
final_df_filtered <- filter(
  select(final_df, AGE, GENDER, CTZUSIN, SALARY),
  AGE > 34,
  AGE < 66
)

Question #1 Does SALARY differ by GENDER ?
The NULL hypothesis is that there is no difference in mean SALARY by GENDER. The alternative hypothesis is that mean SALARY differs between the two GENDER groups.

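The results indicate that there is a statistically significant difference (t = 23.393, p-value < 2.2e-16), so the NULL hypothesis is rejected. The sample means are 1497152.9 for GENDER = 1 and 916008.2 for GENDER = 2, which is a large difference.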

# Two-sample t-test of SALARY for GENDER == 1 (x) versus GENDER == 2 (y).
# With no var.equal argument, t.test() runs the Welch (unequal-variance) form.
# The argument expressions are kept as-is because t.test() echoes them in the
# "data:" line of its printed result.
a <- t.test(
  x = final_df_filtered$SALARY[final_df_filtered$GENDER == 1],
  y = final_df_filtered$SALARY[final_df_filtered$GENDER == 2]
)
print(a)
## 
##  Welch Two Sample t-test
## 
## data:  final_df_filtered$SALARY[final_df_filtered$GENDER == 1] and final_df_filtered$SALARY[final_df_filtered$GENDER == 2]
## t = 23.393, df = 51010, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  532452.3 629837.2
## sample estimates:
## mean of x mean of y 
## 1497152.9  916008.2

Question #2 Does SALARY differ by CTZUSIN?

The NULL hypothesis is that there is no difference in mean SALARY by CTZUSIN. The alternative hypothesis is that mean SALARY differs by citizenship status.

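The test shows that there is a significant difference (t = -7.296, p-value = 3.27e-13): mean SALARY is 905567.9 for CTZUSIN = 0 versus 1179975.8 for CTZUSIN = 1.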

# Two-sample t-test of SALARY for CTZUSIN == 0 (x) versus CTZUSIN == 1 (y),
# using t.test()'s default Welch (unequal-variance) form.
# The argument expressions are kept as-is because t.test() echoes them in the
# "data:" line of its printed result.
b <- t.test(
  x = final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 0],
  y = final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 1]
)
print(b)
## 
##  Welch Two Sample t-test
## 
## data:  final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 0] and final_df_filtered$SALARY[final_df_filtered$CTZUSIN == 1]
## t = -7.296, df = 7461.4, p-value = 3.27e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -348135.7 -200680.3
## sample estimates:
## mean of x mean of y 
##  905567.9 1179975.8

Question #3 Is there a relationship between SALARY (dependent variable) and AGE, GENDER, CTZUSIN (independent variables)?

The NULL hypothesis is that there is no correlation between SALARY and the other 3 variables. The alternative hypothesis is that SALARY is correlated with at least one of them. Of the three, the strongest correlation with SALARY is for AGE (0.187) and the weakest is for CTZUSIN (0.025).

# Pairwise correlation matrix of every column in the filtered data, using
# cor()'s default method. The SALARY row/column is the one that addresses
# question #3.
final_df_filtered %>%
  cor()
##                AGE      GENDER     CTZUSIN      SALARY
## AGE     1.00000000  0.07170198  0.22245527  0.18672466
## GENDER  0.07170198  1.00000000 -0.04237633 -0.09305419
## CTZUSIN 0.22245527 -0.04237633  1.00000000  0.02515491
## SALARY  0.18672466 -0.09305419  0.02515491  1.00000000

Question #4 Does SALARY of males (GENDER = 2) differ by their citizenship status, CTZUSIN? The NULL hypothesis is that there is no difference in mean SALARY by citizenship status. The alternative hypothesis is that mean SALARY differs by citizenship status. The test shows a significant difference (t = -10.038, p-value < 2.2e-16).

# Restrict the filtered data to GENDER == 2. final_df_filtered already holds
# only AGE, GENDER, CTZUSIN and SALARY, so no further column selection is
# needed here.
# NOTE(review): the surrounding text calls GENDER == 2 "males" -- confirm the
# coding against the codebook.
final_males <- filter(final_df_filtered, GENDER == 2)

# Welch two-sample t-test of SALARY within this subset: CTZUSIN == 0 (x)
# versus CTZUSIN == 1 (y).
# NOTE(review): the result is stored under the name `c`, which shadows the
# name of base::c(); the name is kept so later code that reads it still works.
c <- t.test(
  x = final_males$SALARY[final_males$CTZUSIN == 0],
  y = final_males$SALARY[final_males$CTZUSIN == 1]
)
print(c)
## 
##  Welch Two Sample t-test
## 
## data:  final_males$SALARY[final_males$CTZUSIN == 0] and final_males$SALARY[final_males$CTZUSIN == 1]
## t = -10.038, df = 5395, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -448223.9 -301753.8
## sample estimates:
## mean of x mean of y 
##  577417.0  952405.8

Graphic for question #4

# Simple linear regression of SALARY on CTZUSIN for the final_males subset.
# In the output below the intercept equals the CTZUSIN == 0 group mean and
# the slope equals the gap between the two group means from the t-test above.
mylm <- lm(formula = SALARY ~ CTZUSIN, data = final_males)
mylm %>%
  summary()
## 
## Call:
## lm(formula = SALARY ~ CTZUSIN, data = final_males)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -952406 -877406 -843406 -802406 9422581 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   577417      43801  13.183  < 2e-16 ***
## CTZUSIN       374989      46095   8.135 4.23e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2727000 on 39941 degrees of freedom
## Multiple R-squared:  0.001654,   Adjusted R-squared:  0.001629 
## F-statistic: 66.18 on 1 and 39941 DF,  p-value: 4.235e-16
# Draw the standard diagnostic plots for the fitted model (dispatches to the
# plot method for lm objects).
plot(x = mylm)