Data_Analysis_and_Decision_Making

#Assignment 7
#Exercise 1 

#Create factors, remove NAs
#Process the data to make factors with named levels for race, smsa, and pt.
#Replace the indicator variables ne, mw, so, and we with one factor called region with levels named: ne, mw, so, and we*.
#Remove all NAs.

#load data
#1(a)
library("faraway")

require(car)

## Loading required package: car

## 
## Attaching package: 'car'

## The following objects are masked from 'package:faraway':
## 
##     logit, vif

data("uswages")
summary(uswages)

##       wage              educ           exper            race      
##  Min.   :  50.39   Min.   : 0.00   Min.   :-2.00   Min.   :0.000  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   1st Qu.:0.000  
##  Median : 522.32   Median :12.00   Median :15.00   Median :0.000  
##  Mean   : 608.12   Mean   :13.11   Mean   :18.41   Mean   :0.078  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00   3rd Qu.:0.000  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00   Max.   :1.000  
##       smsa             ne              mw               so        
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.756   Mean   :0.229   Mean   :0.2485   Mean   :0.3125  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##        we             pt        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.21   Mean   :0.0925  
##  3rd Qu.:0.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000

uswages$exper[uswages$exper <0] <-NA

#1(b)
# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")

#1(c)
summary(uswages)

##       wage              educ           exper          race       smsa     
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00   White:1844   No : 488  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   Black: 156   Yes:1512  
##  Median : 522.32   Median :12.00   Median :16.00                          
##  Mean   : 608.12   Mean   :13.11   Mean   :18.74                          
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00                          
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00                          
##                                    NA's   :33                             
##        ne              mw               so               we      
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00  
##  Median :0.000   Median :0.0000   Median :0.0000   Median :0.00  
##  Mean   :0.229   Mean   :0.2485   Mean   :0.3125   Mean   :0.21  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00  
##                                                                  
##    pt      
##  No :1815  
##  Yes: 185  
##            
##            
##            
##            
##

# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
                      region =
                        1*uswages$ne +
                        2*uswages$mw +
                        3*uswages$so +
                        4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")

# delete the four regions ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))

#1(d)
# Take care of NAs
uswages <- na.omit(uswages)

#Exercise 2
summary(uswages)

##       wage              educ           exper          race       smsa     
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00   White:1812   No : 483  
##  1st Qu.: 314.69   1st Qu.:12.00   1st Qu.: 8.00   Black: 155   Yes:1484  
##  Median : 522.32   Median :12.00   Median :16.00                          
##  Mean   : 613.99   Mean   :13.08   Mean   :18.74                          
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00                          
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00                          
##    pt       region  
##  No :1802   ne:448  
##  Yes: 165   mw:488  
##             so:616  
##             we:415  
##                     
##

#Exercise 3
#Compute OLS fit to model log(wage)~.
#Perform the Shapiro-Wilk Test of Normality for the residuals, what is the conclusion?

g1 = lm(log(wage) ~ ., data = uswages)
summary(g1)

## 
## Call:
## lm(formula = log(wage) ~ ., data = uswages)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5158 -0.3309  0.0504  0.3520  3.9446 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.710039   0.076052  61.932  < 2e-16 ***
## educ         0.087515   0.004494  19.472  < 2e-16 ***
## exper        0.015483   0.001016  15.243  < 2e-16 ***
## raceBlack   -0.215273   0.048723  -4.418 1.05e-05 ***
## smsaYes      0.177741   0.030177   5.890 4.54e-09 ***
## ptYes       -1.067418   0.046344 -23.032  < 2e-16 ***
## regionmw     0.008320   0.037489   0.222    0.824    
## regionso     0.010080   0.035945   0.280    0.779    
## regionwe     0.046846   0.038922   1.204    0.229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5686 on 1958 degrees of freedom
## Multiple R-squared:  0.368,  Adjusted R-squared:  0.3654 
## F-statistic: 142.5 on 8 and 1958 DF,  p-value: < 2.2e-16

shapiro.test(residuals(g1))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(g1)
## W = 0.96353, p-value < 2.2e-16

#The Shapiro-Wilk test rejects normality of errors.

#Exercise 4
#Compute WLS fit to model log(wage)~. and weights = 1/(1+educ)
#Perform the Shapiro-Wilk Test of Normality for the residuals, what is the conclusion?
summary(uswages)

##       wage              educ           exper          race       smsa     
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00   White:1812   No : 483  
##  1st Qu.: 314.69   1st Qu.:12.00   1st Qu.: 8.00   Black: 155   Yes:1484  
##  Median : 522.32   Median :12.00   Median :16.00                          
##  Mean   : 613.99   Mean   :13.08   Mean   :18.74                          
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00                          
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00                          
##    pt       region  
##  No :1802   ne:448  
##  Yes: 165   mw:488  
##             so:616  
##             we:415  
##                     
##

g2 = lm(log(wage) ~ ., data = uswages, weight = 1/(1+ educ))
summary(g2)

## 
## Call:
## lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 + 
##     educ))
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69235 -0.09294  0.01296  0.09763  2.99856 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.091092   0.072223  70.491  < 2e-16 ***
## educ         0.064515   0.003791  17.019  < 2e-16 ***
## exper        0.013344   0.001078  12.375  < 2e-16 ***
## raceBlack   -0.231167   0.051227  -4.513 6.78e-06 ***
## smsaYes      0.215742   0.032822   6.573 6.30e-11 ***
## ptYes       -1.000953   0.051389 -19.478  < 2e-16 ***
## regionmw    -0.072096   0.042154  -1.710   0.0874 .  
## regionso    -0.068740   0.039576  -1.737   0.0826 .  
## regionwe    -0.088407   0.042706  -2.070   0.0386 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1744 on 1958 degrees of freedom
## Multiple R-squared:  0.2896, Adjusted R-squared:  0.2867 
## F-statistic:  99.8 on 8 and 1958 DF,  p-value: < 2.2e-16

compareCoefs(g1,g2)

## 
## Call:
## 1: lm(formula = log(wage) ~ ., data = uswages)
## 2: lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 + educ))
##               Est. 1     SE 1   Est. 2     SE 2
## (Intercept)  4.71004  0.07605  5.09109  0.07222
## educ         0.08751  0.00449  0.06452  0.00379
## exper        0.01548  0.00102  0.01334  0.00108
## raceBlack   -0.21527  0.04872 -0.23117  0.05123
## smsaYes      0.17774  0.03018  0.21574  0.03282
## ptYes       -1.06742  0.04634 -1.00095  0.05139
## regionmw     0.00832  0.03749 -0.07210  0.04215
## regionso     0.01008  0.03595 -0.06874  0.03958
## regionwe     0.04685  0.03892 -0.08841  0.04271

shapiro.test(residuals(g2))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(g2)
## W = 0.972, p-value < 2.2e-16

#The Shapiro-Wilk test rejects normality of errors.

#Exercise 5
#Compute Robust fit to model log(wage)~. using Huber, Hampel, Biquare, LTS, and LAD
#Compare coefficients of the above fits using OLS, WLS, Huber, Hampel, Biquare, LTS, and LAD
#Which would you recommend?
#Why?
ols= lm(log(wage)~ . , data = uswages )

wls = lm(log(wage) ~ ., data = uswages, weight = 1/(1+ educ))

#Huber M - estimation
library(MASS)
huber = rlm(log(wage) ~ .,psi= psi.huber, data = uswages )

#Tukey Bisquare M-estimation
bisquare = rlm(log(wage) ~ .,psi= psi.bisquare,init="lts",maxit=100 ,data = uswages )

#Hample M-estimation
hample = rlm(log(wage) ~ .,psi= psi.hampel,init="lts",maxit=100 ,data = uswages )

#Least Trimmed Squares(LTS)
#install.packages("robustbase")
library(robustbase)

## Warning: package 'robustbase' was built under R version 3.4.2

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:faraway':
## 
##     epilepsy

lts = ltsReg(log(wage) ~ .,data = uswages )

## Warning in covMcd(X, alpha = alpha, use.correction = use.correction): The 988-th order statistic of the absolute deviation of variable 3
## is zero.
## There are 1812 observations (in the entire dataset of 1967 obs.)
## lying on the hyperplane with equation a_1*(x_i1 - m_1) + ... +
## a_p*(x_ip - m_p) = 0 with (m_1, ..., m_p) the mean of these
## observations and coefficients a_i from the vector a <- c(0, 0, 1,
## 0, 0, 0, 0, 0)

lts_exact =ltsReg(log(wage) ~ .,data = uswages ,
                  nsamp = "exact")

## Warning in .fastlts(x, y, h, nsamp, intercept, adjust, trace = as.integer(trace)): 'nsamp' options 'best' and 'exact' not allowed for n greater than 599. Will use default.

## Warning in covMcd(X, alpha = alpha, use.correction = use.correction): The 988-th order statistic of the absolute deviation of variable 3
## is zero.
## There are 1812 observations (in the entire dataset of 1967 obs.)
## lying on the hyperplane with equation a_1*(x_i1 - m_1) + ... +
## a_p*(x_ip - m_p) = 0 with (m_1, ..., m_p) the mean of these
## observations and coefficients a_i from the vector a <- c(0, 0, 1,
## 0, 0, 0, 0, 0)

#Least Absolute Deviation
library(quantreg)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

lad = rq(log(wage) ~ .,data = uswages )


coefs = compareCoefs(ols,wls,huber,bisquare,hample,lts,lts_exact,lad,se = FALSE)

## Warning in compareCoefs(ols, wls, huber, bisquare, hample, lts,
## lts_exact, : models to be compared are of different classes

## 
## Call:
## 1: lm(formula = log(wage) ~ ., data = uswages)
## 2: lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 + educ))
## 3: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.huber)
## 4: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.bisquare, 
##   init = "lts", maxit = 100)
## 5: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.hampel, init 
##   = "lts", maxit = 100)
## 6: ltsReg.formula(formula = log(wage) ~ ., data = uswages)
## 7: ltsReg.formula(formula = log(wage) ~ ., data = uswages, nsamp = 
##   "exact")
## 8: rq(formula = log(wage) ~ ., data = uswages)
##                Est. 1    Est. 2    Est. 3    Est. 4    Est. 5    Est. 6
## (Intercept)  4.710039  5.091092  4.621799  4.579708  4.605699          
## educ         0.087515  0.064515  0.096150  0.099596  0.096683  0.100869
## exper        0.015483  0.013344  0.016229  0.016741  0.016109  0.017604
## raceBlack   -0.215273 -0.231167 -0.196352 -0.200370 -0.201430 -0.213834
## smsaYes      0.177741  0.215742  0.161001  0.158679  0.163157  0.160903
## ptYes       -1.067418 -1.000953 -1.152926 -1.172663 -1.141632 -1.240930
## regionmw     0.008320 -0.072096  0.024339  0.022086  0.013778  0.018559
## regionso     0.010080 -0.068740 -0.009966 -0.017928 -0.005755 -0.036881
## regionwe     0.046846 -0.088407  0.038043  0.030596  0.036834  0.048242
## Intercept                                                      4.568491
##                Est. 7    Est. 8
## (Intercept)            4.663960
## educ         0.101826  0.094944
## exper        0.017784  0.016968
## raceBlack   -0.200400 -0.251185
## smsaYes      0.152231  0.180681
## ptYes       -1.231272 -1.197686
## regionmw     0.026702  0.000201
## regionso    -0.035224 -0.046384
## regionwe     0.053190  0.013072
## Intercept    4.555933

colnames(coefs) = c("OLS","WLS","Huber", "Bisquare", "Hample", "LTS", "LTS-exact", "LAD")
coefs

##                     OLS         WLS        Huber    Bisquare      Hample
## (Intercept)  4.71003927  5.09109161  4.621799237  4.57970795  4.60569936
## educ         0.08751484  0.06451525  0.096150108  0.09959648  0.09668271
## exper        0.01548319  0.01334415  0.016228613  0.01674113  0.01610866
## raceBlack   -0.21527343 -0.23116674 -0.196351924 -0.20037038 -0.20143047
## smsaYes      0.17774071  0.21574205  0.161001139  0.15867881  0.16315655
## ptYes       -1.06741757 -1.00095287 -1.152926040 -1.17266284 -1.14163240
## regionmw     0.00831990 -0.07209561  0.024339415  0.02208550  0.01377789
## regionso     0.01008044 -0.06874001 -0.009966393 -0.01792789 -0.00575486
## regionwe     0.04684583 -0.08840729  0.038042793  0.03059615  0.03683405
## Intercept            NA          NA           NA          NA          NA
##                     LTS   LTS-exact          LAD
## (Intercept)          NA          NA  4.663959542
## educ         0.10086887  0.10182550  0.094943531
## exper        0.01760387  0.01778414  0.016967663
## raceBlack   -0.21383395 -0.20040017 -0.251184921
## smsaYes      0.16090299  0.15223141  0.180681034
## ptYes       -1.24093034 -1.23127227 -1.197686089
## regionmw     0.01855923  0.02670195  0.000200561
## regionso    -0.03688079 -0.03522418 -0.046384126
## regionwe     0.04824153  0.05318957  0.013072260
## Intercept    4.56849100  4.55593333           NA

#We see that LTS and LTS-exact appear to agree with each other and both are very different different from OLS.
#All three M-estimation methods, Huber, Bisquare, and Hample are different from each other, and different from OLS and both LTS's.
#LAD is similar to OLS.
#LTS is recommended since it has the best breakdown.

Data_Analysis_and_Decision_Making_7.R

aksha

Mon Oct 09 02:58:18 2017