library(haven)
## Warning: package 'haven' was built under R version 3.4.2
# Read the Stata (.dta) file hosted on GitHub into `ipums`.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# List the variables available in the data set.
colnames(ipums)
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "ownershp"  "ownershpd"
## [11] "bedrooms"  "pernum"    "perwt"     "famsize"   "nchild"   
## [16] "nchlt5"    "eldch"     "nsibs"     "relate"    "related"  
## [21] "sex"       "age"       "marst"     "birthyr"   "fertyr"   
## [26] "race"      "raced"     "hispan"    "hispand"   "bpl"      
## [31] "bpld"      "citizen"   "yrsusa1"   "language"  "languaged"
## [36] "speakeng"  "educ"      "educd"     "empstat"   "empstatd" 
## [41] "labforce"  "occ"       "ind"       "inctot"    "incwage"  
## [46] "poverty"   "hwsei"     "migrate1"  "migrate1d" "carpool"  
## [51] "trantime"

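Wages are selected as the continuous outcome (dependent variable) for this analysis, while commute time is selected as the continuous predictor (independent variable). Note, however, that the model fitted below, lm(trantime ~ mywage), places commute time on the left-hand side, so its coefficients describe how expected commute time changes with wage.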

library(broom)
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Build the analysis sample: keep respondents aged 18+ in metro area code
# 21340, and create `mywage` from `incwage` with the codes 999998 and 999999
# recoded to NA (presumably the missing / not-applicable codes -- confirm
# against the codebook).
# NOTE(review): the residual median reported by summary() equals minus the
# intercept, which suggests many rows with trantime == 0 and mywage == 0;
# confirm whether 0 is a not-applicable code for trantime before interpreting.
newpums2 <- ipums %>%
  filter(age >= 18, met2013 == 21340) %>%
  mutate(
    mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage)
  )
# Simple linear regression with commute time (trantime) as the response and
# wage (mywage) as the only predictor, followed by the fitted coefficients.
# NOTE(review): the accompanying text names wage as the outcome, but this
# formula has trantime on the left-hand side -- confirm the intended direction.
fitx <- lm(formula = trantime ~ mywage, data = newpums2)
coefficients(fitx)
## (Intercept)      mywage 
## 7.991873129 0.000267356
# Scatterplot of commute time against wage with the least-squares line
# overlaid and no confidence band.
ggplot(data = newpums2, mapping = aes(x = mywage, y = trantime)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

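This model shows a positive association between wage and commute time: the fitted slope of about 0.00027 means that each additional $10,000 in wages is associated with roughly 2.7 more minutes of commuting. In the scatterplot, the higher wages appear mostly among observations with commute times under 50 minutes.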

 # Full model summary: residual quantiles, coefficient table with standard
 # errors and t tests, residual standard error, R-squared and overall F test.
 summary(fitx)
## 
## Call:
## lm(formula = trantime ~ mywage, data = newpums2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.790  -7.992  -7.992   3.666 150.340 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.992e+00  1.056e+00   7.571  1.6e-13 ***
## mywage      2.674e-04  3.422e-05   7.813  2.9e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.59 on 545 degrees of freedom
## Multiple R-squared:  0.1007, Adjusted R-squared:  0.09907 
## F-statistic: 61.04 on 1 and 545 DF,  p-value: 2.897e-14
# 95% confidence intervals (the confint() default level) for the intercept
# and the mywage slope.
confint(fitx)
##                    2.5 %       97.5 %
## (Intercept) 5.9182304592 1.006552e+01
## mywage      0.0002001354 3.345766e-04

The t value of 7.81 and the estimate of 2.674e-04 for mywage (95% confidence interval 2.00e-04 to 3.35e-04) show that there is a statistically significant positive relationship between wages and commute time.

Evaluating the assumptions

# Diagnostic plot 1 for the fitted model: residuals against fitted values,
# used here to judge whether the error variance is constant.
plot(x = fitx, which = 1)

In this plot, we can see that the points at lower fitted values form a tight cluster with less variance than the rest, so the spread of the residuals is not the same across the range of fitted values. This indicates non-constant variance (heteroskedasticity).

library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test for heteroskedasticity of the model errors;
# a small p-value is evidence against constant variance.
lmtest::bptest(fitx)
## 
##  studentized Breusch-Pagan test
## 
## data:  fitx
## BP = 5.2314, df = 1, p-value = 0.02218

Normality of residuals

# Diagnostic plot 2 for the fitted model: normal Q-Q plot of the residuals.
plot(x = fitx, which = 2)

In this plot, we see many points that deviate from the diagonal line, particularly in the upper and lower tails.

# Shapiro-Wilk test of the model residuals; the null hypothesis is that they
# are normally distributed, so a small p-value indicates non-normality.
shapiro.test(resid(fitx))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(fitx)
## W = 0.55058, p-value < 2.2e-16
# One-sample Kolmogorov-Smirnov test of the residuals against the standard
# normal distribution. pnorm defaults to mean 0 and sd 1, so the residuals are
# standardized with rstandard() first; testing the raw residuals (residual
# standard error about 20.6) would reject because of their scale alone,
# whatever their shape.
# NOTE(review): the output recorded below was produced by the earlier
# raw-residual call and should be regenerated when the document is re-knit.
ks.test(rstandard(fitx), y = pnorm)
## Warning in ks.test(resid(fitx), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  resid(fitx)
## D = 0.64001, p-value < 2.2e-16
## alternative hypothesis: two-sided

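The p-values below 2.2e-16 from the Shapiro-Wilk and Kolmogorov-Smirnov tests indicate that the residuals are not normally distributed. Heteroskedasticity is assessed separately by the Breusch-Pagan test above, whose p-value of 0.022 rejects constant variance at the 5% level.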

Based on the analysis of wages and commuting time, we cannot draw a reliable conclusion about the effect of commuting time on wages: although the slope is statistically significant, the model explains only about 10% of the variance, and the residuals violate the constant-variance and normality assumptions. We would need to include in this model other variables that may have a greater impact, such as educational attainment or gender.