Requirements for weighted least squares regression

response= mean across level of predictor

sd(response_i|predictor_i)=TRUE

OR heteroscedasticity_test(simple_lin_reg)=FAIL

response= mean across level of predictor

sd(response_i|predictor_i)=TRUE

cleaningwtd data

clean<-read.table(here('data','cleaningwtd.txt'),header=TRUE)
head(clean)
##   Case Crews Rooms    StdDev
## 1    1    16    51 12.000463
## 2    2    10    37  7.927123
## 3    3    12    37  7.289910
## 4    4    16    46 12.000463
## 5    5    16    45 12.000463
## 6    6     4    11  4.966555

Note that the number of rooms cleaned is a mean, and the standard deviations of rooms cleaned per crew amount is given.

ggplot(clean, aes(x=Crews, y=Rooms))+geom_point()+geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'

Note how the variance increases as the number of crews increases.

Build model list

clean<-read.table(here('data','cleaningwtd.txt'),header=TRUE)
m.list<-list()
m.list[['wls_no_weight']]<-lm(Rooms~Crews,data=clean)
summary(m.list[['wls_no_weight']])
## 
## Call:
## lm(formula = Rooms ~ Crews, data = clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.9990  -4.9901   0.8046   4.0010  17.0010 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.7847     2.0965   0.851    0.399    
## Crews         3.7009     0.2118  17.472   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.336 on 51 degrees of freedom
## Multiple R-squared:  0.8569, Adjusted R-squared:  0.854 
## F-statistic: 305.3 on 1 and 51 DF,  p-value: < 2.2e-16

Check for heteroscedasticity

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lmtest::bptest(m.list[['wls_no_weight']])
## 
##  studentized Breusch-Pagan test
## 
## data:  m.list[["wls_no_weight"]]
## BP = 14.518, df = 1, p-value = 0.0001389

H_0: homoscedasitcity is present (residuals are distributed with equal variance)

H_A: heteroscedasticity is present (residuals are not distributed with equal variance)

if lmtest::bptest(model)==H_A perform Weighted least squares

m.list[['wls']]<-lm(Rooms~Crews,weights=1/StdDev^2,data=clean)

summary(m.list[['wls']])
## 
## Call:
## lm(formula = Rooms ~ Crews, data = clean, weights = 1/StdDev^2)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43184 -0.82013  0.03909  0.69029  2.01030 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8095     1.1158   0.725    0.471    
## Crews         3.8255     0.1788  21.400   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9648 on 51 degrees of freedom
## Multiple R-squared:  0.8998, Adjusted R-squared:  0.8978 
## F-statistic:   458 on 1 and 51 DF,  p-value: < 2.2e-16

Compare fits

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggplot(clean, aes(x=Crews,y=Rooms))+geom_point()+geom_abline(aes(intercept=m.list[['wls']]$coefficients[[1]],slope=m.list[['wls']]$coefficients[[2]],color='wls'))+geom_abline(aes(intercept=m.list[['wls_no_weight']]$coefficients[[1]],slope=m.list[['wls']]$coefficients[[2]],color='unweighted'))