library("faraway")
library("car")
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
## logit, vif
library("ggplot2")
library("gridExtra")
library("sqldf")
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
## Reason: image not found
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library("nlme")
library("zoo")
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library("MASS")
require("MASS")
require("faraway")
require("car")
require("ggplot2")
require("gridExtra")
require("sqldf")
require("nlme")
require("zoo")
setwd("/Users/prashant0268/Desktop/")
## -- 1. Anova Models -- ##
HOUST <- read.csv("HOUST.csv", header = TRUE)
head(HOUST)
## observation_date HOUST
## 1 1975-10-01 296.6
## 2 1976-01-01 280.8
## 3 1976-04-01 439.3
## 4 1976-07-01 434.3
## 5 1976-10-01 382.9
## 6 1977-01-01 367.4
GDP <- read.csv("GDP.csv", header = TRUE)
head(GDP)
## DATE VALUE
## 1 1976-01-01 58.6
## 2 1976-04-01 32.4
## 3 1976-07-01 33.6
## 4 1976-10-01 47.9
## 5 1977-01-01 54.1
## 6 1977-04-01 67.7
CPI <- read.csv("CPI.csv", header = TRUE)
head(CPI)
## DATE VALUE
## 1 1976-01-01 0.633
## 2 1976-04-01 0.500
## 3 1976-07-01 0.900
## 4 1976-10-01 0.833
## 5 1977-01-01 1.067
## 6 1977-04-01 1.033
POP <- read.csv("POP.csv", header = TRUE)
head(POP)
## observation_date B230RC0Q173SBEA_CHG
## 1 1976-01-01 462
## 2 1976-04-01 562
## 3 1976-07-01 579
## 4 1976-10-01 510
## 5 1977-01-01 529
## 6 1977-04-01 617
HouseAll <- merge(x = CPI,y= GDP, by.x = "DATE" , by.y = "DATE" ,all="TRUE")
HouseAll = merge(x = HouseAll , y= HOUST , by.x = "DATE" , by.y= "observation_date", all = "TRUE")
HouseAll = merge(x=HouseAll, y =POP , by.x="DATE" , by.y= "observation_date", all="TRUE")
HouseAll
## DATE VALUE.x VALUE.y HOUST B230RC0Q173SBEA_CHG
## 1 1976-01-01 0.633 58.6 280.8 462
## 2 1976-04-01 0.500 32.4 439.3 562
## 3 1976-07-01 0.900 33.6 434.3 579
## 4 1976-10-01 0.833 47.9 382.9 510
## 5 1977-01-01 1.067 54.1 367.4 529
## 6 1977-04-01 1.033 67.7 581.1 617
## 7 1977-07-01 0.834 62.2 561.5 628
## 8 1977-10-01 0.900 46.3 477.1 518
## 9 1978-01-01 1.066 40.0 362.0 562
## 10 1978-04-01 1.434 127.9 624.5 652
## 11 1978-07-01 1.500 62.3 563.6 650
## 12 1978-10-01 1.533 83.3 470.2 569
## 13 1979-01-01 1.700 49.4 325.6 585
## 14 1979-04-01 2.200 64.3 541.9 681
## 15 1979-07-01 2.300 74.5 498.2 699
## 16 1979-10-01 2.333 60.3 379.3 637
## 17 1980-01-01 3.000 65.8 238.1 635
## 18 1980-04-01 2.667 3.4 304.3 681
## 19 1980-07-01 1.533 60.1 388.3 619
## 20 1980-10-01 2.334 133.5 361.5 466
## 21 1981-01-01 2.366 138.3 264.2 519
## 22 1981-04-01 1.834 35.5 338.7 627
## 23 1981-07-01 2.500 93.9 270.3 602
## 24 1981-10-01 1.500 22.3 210.9 492
## 25 1982-01-01 0.833 -9.7 176.7 511
## 26 1982-04-01 1.367 57.5 274.0 592
## 27 1982-07-01 1.666 35.8 309.2 576
## 28 1982-10-01 0.300 40.7 302.3 472
## 29 1983-01-01 0.067 72.5 322.2 482
## 30 1983-04-01 1.133 103.5 483.9 575
## 31 1983-07-01 0.967 108.5 493.3 550
## 32 1983-10-01 1.000 103.8 403.6 452
## 33 1984-01-01 1.433 116.7 376.6 477
## 34 1984-04-01 0.967 102.2 537.4 575
## 35 1984-07-01 0.900 72.4 458.0 575
## 36 1984-10-01 0.900 60.2 377.4 441
## 37 1985-01-01 0.967 89.4 345.8 503
## 38 1985-04-01 0.966 65.3 509.2 613
## 39 1985-07-01 0.667 92.3 469.1 598
## 40 1985-10-01 1.100 58.5 417.6 474
## 41 1986-01-01 0.567 63.2 373.8 507
## 42 1986-04-01 -0.534 38.9 558.4 594
## 43 1986-07-01 0.667 64.4 489.8 577
## 44 1986-10-01 0.767 49.8 383.4 470
## 45 1987-01-01 1.333 66.8 349.1 511
## 46 1987-04-01 1.267 85.3 480.2 600
## 47 1987-07-01 1.200 79.0 448.0 601
## 48 1987-10-01 1.066 122.2 343.3 487
## 49 1988-01-01 0.900 67.9 297.2 508
## 50 1988-04-01 1.334 117.1 443.6 638
## 51 1988-07-01 1.433 91.8 404.9 612
## 52 1988-10-01 1.300 113.2 342.3 494
## 53 1989-01-01 1.367 114.7 303.7 557
## 54 1989-04-01 1.966 101.0 404.3 681
## 55 1989-07-01 0.967 83.2 366.4 676
## 56 1989-10-01 1.267 51.8 301.7 562
## 57 1990-01-01 2.166 127.4 294.6 775
## 58 1990-04-01 1.267 83.9 357.9 884
## 59 1990-07-01 2.233 54.8 307.1 887
## 60 1990-10-01 2.234 -6.2 233.0 776
## 61 1991-01-01 1.000 31.6 185.4 805
## 62 1991-04-01 0.800 88.7 300.8 902
## 63 1991-07-01 1.033 74.8 284.8 870
## 64 1991-10-01 1.133 60.9 243.0 750
## 65 1992-01-01 0.934 101.5 262.0 854
## 66 1992-04-01 1.066 111.5 340.6 947
## 67 1992-07-01 1.067 94.2 322.1 891
## 68 1992-10-01 1.233 111.1 274.9 762
## 69 1993-01-01 1.034 50.6 240.6 787
## 70 1993-04-01 1.033 81.4 367.2 888
## 71 1993-07-01 0.667 74.6 355.6 833
## 72 1993-10-01 1.200 128.6 324.3 703
## 73 1994-01-01 0.733 103.5 294.0 770
## 74 1994-04-01 0.833 133.5 422.8 850
## 75 1994-07-01 1.367 82.5 397.7 808
## 76 1994-10-01 0.867 124.4 342.5 710
## 77 1995-01-01 1.100 68.6 269.9 754
## 78 1995-04-01 1.233 59.6 370.8 858
## 79 1995-07-01 0.767 101.6 387.3 820
## 80 1995-10-01 0.833 93.0 326.2 667
## 81 1996-01-01 1.367 93.6 302.6 764
## 82 1996-04-01 1.333 168.4 428.5 877
## 83 1996-07-01 0.900 97.5 410.4 863
## 84 1996-10-01 1.367 128.1 335.4 718
## 85 1997-01-01 0.966 115.0 297.3 778
## 86 1997-04-01 0.367 149.8 419.0 904
## 87 1997-07-01 0.800 139.9 400.3 855
## 88 1997-10-01 0.867 96.5 357.4 704
## 89 1998-01-01 0.333 101.4 324.9 753
## 90 1998-04-01 0.533 105.0 447.8 861
## 91 1998-07-01 0.834 151.8 445.0 836
## 92 1998-10-01 0.766 179.2 399.3 703
## 93 1999-01-01 0.600 121.4 364.3 761
## 94 1999-04-01 1.234 109.9 447.2 887
## 95 1999-07-01 1.233 155.3 445.8 841
## 96 1999-10-01 1.233 213.8 383.9 712
## 97 2000-01-01 1.667 104.9 357.1 698
## 98 2000-04-01 1.333 247.3 448.7 767
## 99 2000-07-01 1.567 79.1 405.3 749
## 100 2000-10-01 1.233 114.9 357.5 651
## 101 2001-01-01 1.667 35.8 347.8 669
## 102 2001-04-01 1.233 130.3 460.5 746
## 103 2001-07-01 0.500 1.1 429.2 727
## 104 2001-10-01 -0.133 61.8 365.4 624
## 105 2002-01-01 0.567 133.1 369.0 639
## 106 2002-04-01 1.400 100.4 474.6 729
## 107 2002-07-01 0.966 102.3 458.5 704
## 108 2002-10-01 1.067 66.7 402.9 602
## 109 2003-01-01 1.867 126.3 374.7 644
## 110 2003-04-01 -0.300 140.6 490.7 721
## 111 2003-07-01 1.366 254.4 510.9 695
## 112 2003-10-01 0.700 191.7 471.4 568
## 113 2004-01-01 1.567 171.6 424.7 638
## 114 2004-04-01 1.467 193.0 539.4 728
## 115 2004-07-01 1.200 186.3 531.9 731
## 116 2004-10-01 2.033 194.5 459.6 623
## 117 2005-01-01 0.967 251.5 448.2 631
## 118 2005-04-01 1.300 160.4 575.3 752
## 119 2005-07-01 2.933 231.3 567.5 746
## 120 2005-10-01 1.833 176.2 477.1 650
## 121 2006-01-01 1.034 267.3 464.0 672
## 122 2006-04-01 1.800 150.9 520.9 772
## 123 2006-07-01 1.900 108.7 457.8 766
## 124 2006-10-01 -0.834 157.9 358.2 663
## 125 2007-01-01 1.984 166.8 321.9 675
## 126 2007-04-01 2.314 189.1 409.9 778
## 127 2007-07-01 1.308 147.4 350.6 767
## 128 2007-10-01 2.551 115.6 272.7 665
## 129 2008-01-01 2.280 -16.9 231.4 666
## 130 2008-04-01 2.768 144.6 283.7 742
## 131 2008-07-01 3.323 30.0 237.0 714
## 132 2008-10-01 -5.012 -293.1 153.4 621
## 133 2009-01-01 -1.471 -166.0 114.4 629
## 134 2009-04-01 1.129 -43.5 153.8 707
## 135 2009-07-01 1.837 43.7 162.3 712
## 136 2009-10-01 1.686 182.4 123.4 616
## 137 2010-01-01 0.344 114.6 134.3 571
## 138 2010-04-01 -0.077 207.5 172.0 633
## 139 2010-07-01 0.637 169.1 160.8 646
## 140 2010-10-01 1.765 172.5 119.8 536
## 141 2011-01-01 2.345 8.2 125.5 542
## 142 2011-04-01 2.524 222.5 163.5 639
## 143 2011-07-01 1.465 126.2 170.9 622
## 144 2011-10-01 1.014 198.2 148.9 526
## 145 2012-01-01 1.330 188.6 154.9 530
## 146 2012-04-01 0.461 148.0 209.3 634
## 147 2012-07-01 0.931 106.0 214.0 638
## 148 2012-10-01 1.633 69.4 202.4 494
## 149 2013-01-01 0.918 178.1 208.1 521
## 150 2013-04-01 -0.284 66.0 244.2 652
## 151 2013-07-01 1.177 207.9 242.8 670
## 152 2013-10-01 1.072 250.6 229.8 546
## 153 2014-01-01 1.366 25.3 206.0 565
## 154 2014-04-01 1.120 260.4 274.7 678
## 155 2014-07-01 0.540 283.8 281.4 678
## 156 2014-10-01 -0.186 122.8 241.2 549
## 157 2015-01-01 -1.716 91.4 214.6 566
## 158 2015-04-01 1.423 214.7 320.4 678
## 159 2015-07-01 0.810 143.6 318.0 678
## 160 2015-10-01 0.455 80.9 258.9 549
## 161 2016-01-01 -0.186 58.8 NA NA
## 162 2016-04-01 NA 168.5 NA NA
## 163 2016-07-01 NA 207.8 NA NA
## 164 1975-10-01 NA NA 296.6 NA
## 165 NA NA NA NA
summary(HouseAll)
## DATE VALUE.x VALUE.y HOUST
## 1976-01-01: 1 Min. :-5.012 Min. :-293.10 Min. :114.4
## 1976-04-01: 1 1st Qu.: 0.833 1st Qu.: 62.25 1st Qu.:274.7
## 1976-07-01: 1 Median : 1.120 Median : 101.40 Median :357.4
## 1976-10-01: 1 Mean : 1.134 Mean : 103.63 Mean :351.6
## 1977-01-01: 1 3rd Qu.: 1.500 3rd Qu.: 142.10 3rd Qu.:439.3
## 1977-04-01: 1 Max. : 3.323 Max. : 283.80 Max. :624.5
## (Other) :159 NA's :4 NA's :2 NA's :4
## B230RC0Q173SBEA_CHG
## Min. :441.0
## 1st Qu.:574.0
## Median :650.5
## Mean :662.0
## 3rd Qu.:746.8
## Max. :947.0
## NA's :5
HouseAll <- na.omit(HouseAll)
quartdf = as.yearqtr(HouseAll$DATE, format = "%Y-%m-%d")
quartdf
## [1] "1976 Q1" "1976 Q2" "1976 Q3" "1976 Q4" "1977 Q1" "1977 Q2" "1977 Q3"
## [8] "1977 Q4" "1978 Q1" "1978 Q2" "1978 Q3" "1978 Q4" "1979 Q1" "1979 Q2"
## [15] "1979 Q3" "1979 Q4" "1980 Q1" "1980 Q2" "1980 Q3" "1980 Q4" "1981 Q1"
## [22] "1981 Q2" "1981 Q3" "1981 Q4" "1982 Q1" "1982 Q2" "1982 Q3" "1982 Q4"
## [29] "1983 Q1" "1983 Q2" "1983 Q3" "1983 Q4" "1984 Q1" "1984 Q2" "1984 Q3"
## [36] "1984 Q4" "1985 Q1" "1985 Q2" "1985 Q3" "1985 Q4" "1986 Q1" "1986 Q2"
## [43] "1986 Q3" "1986 Q4" "1987 Q1" "1987 Q2" "1987 Q3" "1987 Q4" "1988 Q1"
## [50] "1988 Q2" "1988 Q3" "1988 Q4" "1989 Q1" "1989 Q2" "1989 Q3" "1989 Q4"
## [57] "1990 Q1" "1990 Q2" "1990 Q3" "1990 Q4" "1991 Q1" "1991 Q2" "1991 Q3"
## [64] "1991 Q4" "1992 Q1" "1992 Q2" "1992 Q3" "1992 Q4" "1993 Q1" "1993 Q2"
## [71] "1993 Q3" "1993 Q4" "1994 Q1" "1994 Q2" "1994 Q3" "1994 Q4" "1995 Q1"
## [78] "1995 Q2" "1995 Q3" "1995 Q4" "1996 Q1" "1996 Q2" "1996 Q3" "1996 Q4"
## [85] "1997 Q1" "1997 Q2" "1997 Q3" "1997 Q4" "1998 Q1" "1998 Q2" "1998 Q3"
## [92] "1998 Q4" "1999 Q1" "1999 Q2" "1999 Q3" "1999 Q4" "2000 Q1" "2000 Q2"
## [99] "2000 Q3" "2000 Q4" "2001 Q1" "2001 Q2" "2001 Q3" "2001 Q4" "2002 Q1"
## [106] "2002 Q2" "2002 Q3" "2002 Q4" "2003 Q1" "2003 Q2" "2003 Q3" "2003 Q4"
## [113] "2004 Q1" "2004 Q2" "2004 Q3" "2004 Q4" "2005 Q1" "2005 Q2" "2005 Q3"
## [120] "2005 Q4" "2006 Q1" "2006 Q2" "2006 Q3" "2006 Q4" "2007 Q1" "2007 Q2"
## [127] "2007 Q3" "2007 Q4" "2008 Q1" "2008 Q2" "2008 Q3" "2008 Q4" "2009 Q1"
## [134] "2009 Q2" "2009 Q3" "2009 Q4" "2010 Q1" "2010 Q2" "2010 Q3" "2010 Q4"
## [141] "2011 Q1" "2011 Q2" "2011 Q3" "2011 Q4" "2012 Q1" "2012 Q2" "2012 Q3"
## [148] "2012 Q4" "2013 Q1" "2013 Q2" "2013 Q3" "2013 Q4" "2014 Q1" "2014 Q2"
## [155] "2014 Q3" "2014 Q4" "2015 Q1" "2015 Q2" "2015 Q3" "2015 Q4"
quarter <- format(quartdf, format="%q")
quarter
## [1] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
## [18] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2"
## [35] "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3"
## [52] "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
## [69] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
## [86] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2"
## [103] "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3"
## [120] "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
## [137] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
## [154] "2" "3" "4" "1" "2" "3" "4"
HouseAll <- data.frame(HouseAll, quarter)
HouseAll$quarter <- factor(HouseAll$quarter)
levels(HouseAll$quarter) <- c("Q1","Q2","Q3","Q4")
names(HouseAll) <- c("DATE", "CPIValue", "GDPValue", "HOUSTValue", "POPValue", "quarter")
names(HouseAll)
## [1] "DATE" "CPIValue" "GDPValue" "HOUSTValue" "POPValue"
## [6] "quarter"
summary(HouseAll)
## DATE CPIValue GDPValue HOUSTValue
## 1976-01-01: 1 Min. :-5.012 Min. :-293.10 Min. :114.4
## 1976-04-01: 1 1st Qu.: 0.833 1st Qu.: 62.27 1st Qu.:274.5
## 1976-07-01: 1 Median : 1.125 Median : 101.20 Median :357.4
## 1976-10-01: 1 Mean : 1.143 Mean : 102.86 Mean :351.9
## 1977-01-01: 1 3rd Qu.: 1.500 3rd Qu.: 140.07 3rd Qu.:440.4
## 1977-04-01: 1 Max. : 3.323 Max. : 283.80 Max. :624.5
## (Other) :154
## POPValue quarter
## Min. :441.0 Q1:40
## 1st Qu.:574.0 Q2:40
## Median :650.5 Q3:40
## Mean :662.0 Q4:40
## 3rd Qu.:746.8
## Max. :947.0
##
# FIRST MODEL:
housemod <- lm(HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + HouseAll$quarter)
sumary(housemod)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 271.06863 20.91476 12.9606 < 2.2e-16
## HouseAll$GDPValue 0.22075 0.12069 1.8290 0.0693276
## HouseAll$CPIValue 1.84682 9.83017 0.1879 0.8512239
## HouseAll$quarterQ2 105.33630 23.53808 4.4751 1.477e-05
## HouseAll$quarterQ3 88.28523 23.45485 3.7641 0.0002373
## HouseAll$quarterQ4 30.49725 23.42023 1.3022 0.1948005
##
## n = 160, p = 6, Residual SE = 104.40269, R-Squared = 0.18
summary(housemod)
##
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue +
## HouseAll$quarter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -266.68 -69.19 12.62 71.28 217.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 271.0686 20.9148 12.961 < 2e-16 ***
## HouseAll$GDPValue 0.2208 0.1207 1.829 0.069328 .
## HouseAll$CPIValue 1.8468 9.8302 0.188 0.851224
## HouseAll$quarterQ2 105.3363 23.5381 4.475 1.48e-05 ***
## HouseAll$quarterQ3 88.2852 23.4548 3.764 0.000237 ***
## HouseAll$quarterQ4 30.4973 23.4202 1.302 0.194801
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.4 on 154 degrees of freedom
## Multiple R-squared: 0.1782, Adjusted R-squared: 0.1515
## F-statistic: 6.677 on 5 and 154 DF, p-value: 1.173e-05
# Q. (2) Use one-way ANOVA to determine whether there’s a seasonal effect. Show necessary steps and explanation.
require(ggplot2)
ggplot(HouseAll, aes(x = HouseAll$quarter, y = HouseAll$HOUSTValue )) +
geom_boxplot(fill = "grey80", colour = "blue") +
scale_x_discrete() + xlab("Quarter Distribution") +
ylab("New Privately Owned Housing Units Started")

# Initial inspection of the data suggests that there are differences in the housing units start counts for the quarters; but it is not so clear cut to determine whether the quarterly data is different within itself.
# To investigate these differences we fit the one-way ANOVA model using the lm function and look at the parameter estimates and standard errors for the treatment effects. The function call is:
housemod1 <- lm(HouseAll$HOUSTValue ~ HouseAll$quarter, data = HouseAll)
# We save the model fitted to the data in an object so that we can undertake various actions to study the goodness of the fit to the data and other model assumptions. The standard summary of a lm object is used to produce the following output:
summary(housemod1)
##
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$quarter, data = HouseAll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -250.32 -76.57 17.00 72.60 220.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 292.88 16.61 17.633 < 2e-16 ***
## HouseAll$quarterQ2 111.24 23.49 4.736 4.87e-06 ***
## HouseAll$quarterQ3 92.36 23.49 3.932 0.000126 ***
## HouseAll$quarterQ4 32.51 23.49 1.384 0.168265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105 on 156 degrees of freedom
## Multiple R-squared: 0.1572, Adjusted R-squared: 0.1409
## F-statistic: 9.696 on 3 and 156 DF, p-value: 6.625e-06
round(coef(housemod1),1)
## (Intercept) HouseAll$quarterQ2 HouseAll$quarterQ3
## 292.9 111.2 92.4
## HouseAll$quarterQ4
## 32.5
round(coef(housemod))
## (Intercept) HouseAll$GDPValue HouseAll$CPIValue
## 271 0 2
## HouseAll$quarterQ2 HouseAll$quarterQ3 HouseAll$quarterQ4
## 105 88 30
# Q1 is the reference level and has a mean of 271, Q2,3,4 are 105, 88 and 30 units larger, respectively on an average.
# The model output indicates some evidence of a difference in the average count for the quarter groups. An analysis of variance table for this model can be produced via the anova command:
anova(housemod)
## Analysis of Variance Table
##
## Response: HouseAll$HOUSTValue
## Df Sum Sq Mean Sq F value Pr(>F)
## HouseAll$GDPValue 1 76745 76745 7.0408 0.008803 **
## HouseAll$CPIValue 1 2025 2025 0.1858 0.667068
## HouseAll$quarter 3 285122 95041 8.7194 2.229e-05 ***
## Residuals 154 1678588 10900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This table confirms that there are differences between the groups which were highlighted in the model summary. The function confint is used to calculate confidence intervals on the treatment parameters, by default 95% confidence intervals:
confint(housemod)
## 2.5 % 97.5 %
## (Intercept) 229.75176751 312.3854828
## HouseAll$GDPValue -0.01767501 0.4591816
## HouseAll$CPIValue -17.57257332 21.2662125
## HouseAll$quarterQ2 58.83709948 151.8354953
## HouseAll$quarterQ3 41.95045749 134.6200077
## HouseAll$quarterQ4 -15.76912706 76.7636333
house.mod = data.frame(Fitted = fitted(housemod),
Residuals = resid(housemod), Quarters = HouseAll$quarter)
# and then produce the plot:
ggplot(house.mod, aes(Fitted, Residuals, colour = Quarters)) + geom_point()

# There is some evidence of different variabilities in the spread of the residuals for the quarters.
# Thus, we could determine using one way ANOVA that there is seasonal effect on the New Privately Owned Housing Units Start counts.
# Q. (3) Do pair-wise comparison for different levels. In particular, construct %90 confi- dence intervals for the pairwise differences.
aov(housemod)
## Call:
## aov(formula = housemod)
##
## Terms:
## HouseAll$GDPValue HouseAll$CPIValue HouseAll$quarter
## Sum of Squares 76744.6 2024.8 285121.6
## Deg. of Freedom 1 1 3
## Residuals
## Sum of Squares 1678588.0
## Deg. of Freedom 154
##
## Residual standard error: 104.4027
## Estimated effects may be unbalanced
results <- aov(housemod)
houseconf <- TukeyHSD(results, conf.level = 0.90)
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$GDPValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$CPIValue
## Warning in TukeyHSD.aov(results, conf.level = 0.9): 'which' specified some
## non-factors which will be dropped
houseconf
## Tukey multiple comparisons of means
## 90% family-wise confidence level
##
## Fit: aov(formula = housemod)
##
## $`HouseAll$quarter`
## diff lwr upr p adj
## Q2-Q1 103.40958 49.46254 157.356616 0.0001042
## Q3-Q1 86.73156 32.78452 140.678595 0.0016008
## Q4-Q1 30.09146 -23.85557 84.038500 0.5713311
## Q3-Q2 -16.67802 -70.62506 37.269016 0.8912549
## Q4-Q2 -73.31812 -127.26515 -19.371079 0.0107807
## Q4-Q3 -56.64009 -110.58713 -2.693057 0.0764098
# The intervals can be plotted as seen below:
plot(houseconf)

# Q. (4) Add population to the first model, do the above 2 steps again.
housemod2 <- lm(HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + HouseAll$quarter + HouseAll$POPValue)
summary(housemod2)
##
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue +
## HouseAll$quarter + HouseAll$POPValue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -271.25 -64.11 15.78 70.93 213.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 299.00784 53.40285 5.599 9.73e-08 ***
## HouseAll$GDPValue 0.22720 0.12149 1.870 0.06337 .
## HouseAll$CPIValue 1.88789 9.85210 0.192 0.84829
## HouseAll$quarterQ2 109.61624 24.76083 4.427 1.81e-05 ***
## HouseAll$quarterQ3 91.91961 24.35938 3.773 0.00023 ***
## HouseAll$quarterQ4 28.98273 23.62236 1.227 0.22174
## HouseAll$POPValue -0.04569 0.08032 -0.569 0.57031
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.6 on 153 degrees of freedom
## Multiple R-squared: 0.1799, Adjusted R-squared: 0.1477
## F-statistic: 5.594 on 6 and 153 DF, p-value: 2.852e-05
round(coef(housemod2))
## (Intercept) HouseAll$GDPValue HouseAll$CPIValue
## 299 0 2
## HouseAll$quarterQ2 HouseAll$quarterQ3 HouseAll$quarterQ4
## 110 92 29
## HouseAll$POPValue
## 0
# 4.1) is the reference level and has a mean of 299, Q2,3,4 are 110, 92 and 29 units larger, respectively on an average.
# The model output indicates some evidence of a difference in the average count for the quarter groups. An analysis of variance table for this model can be produced via the anova command:
anova(housemod2)
## Analysis of Variance Table
##
## Response: HouseAll$HOUSTValue
## Df Sum Sq Mean Sq F value Pr(>F)
## HouseAll$GDPValue 1 76745 76745 7.0099 0.008955 **
## HouseAll$CPIValue 1 2025 2025 0.1849 0.667759
## HouseAll$quarter 3 285122 95041 8.6811 2.35e-05 ***
## HouseAll$POPValue 1 3542 3542 0.3236 0.570308
## Residuals 153 1675046 10948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This table confirms that there are differences between the groups which were highlighted in the model summary. The function confint is used to calculate confidence intervals on the treatment parameters, by default 95% confidence intervals:
confint(housemod2)
## 2.5 % 97.5 %
## (Intercept) 193.50569403 404.5099869
## HouseAll$GDPValue -0.01280996 0.4672180
## HouseAll$CPIValue -17.57582059 21.3516062
## HouseAll$quarterQ2 60.69898924 158.5334880
## HouseAll$quarterQ3 43.79544620 140.0437679
## HouseAll$quarterQ4 -17.68538422 75.6508442
## HouseAll$POPValue -0.20437938 0.1129978
house.mod = data.frame(Fitted = fitted(housemod),
Residuals = resid(housemod), Quarters = HouseAll$quarter)
# and then produce the plot:
ggplot(house.mod, aes(Fitted, Residuals, colour = Quarters)) + geom_point()

# There is some evidence of different variabilities in the spread of the residuals for the quarters.
# Thus, we could determine using one way ANOVA that there is seasonal effect on the New Privately Owned Housing Units Start counts.
# Q. (3) Do pair-wise comparison for different levels. In particular, construct %90 confi- dence intervals for the pairwise differences.
aov(housemod2)
## Call:
## aov(formula = housemod2)
##
## Terms:
## HouseAll$GDPValue HouseAll$CPIValue HouseAll$quarter
## Sum of Squares 76744.6 2024.8 285121.6
## Deg. of Freedom 1 1 3
## HouseAll$POPValue Residuals
## Sum of Squares 3542.4 1675045.6
## Deg. of Freedom 1 153
##
## Residual standard error: 104.6327
## Estimated effects may be unbalanced
results <- aov(housemod2)
houseconf <- TukeyHSD(results, conf.level = 0.90)
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$GDPValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$CPIValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$POPValue
## Warning in TukeyHSD.aov(results, conf.level = 0.9): 'which' specified some
## non-factors which will be dropped
houseconf
## Tukey multiple comparisons of means
## 90% family-wise confidence level
##
## Fit: aov(formula = housemod2)
##
## $`HouseAll$quarter`
## diff lwr upr p adj
## Q2-Q1 103.40958 49.34067 157.478488 0.0001089
## Q3-Q1 86.73156 32.66265 140.800466 0.0016512
## Q4-Q1 30.09146 -23.97745 84.160372 0.5731405
## Q3-Q2 -16.67802 -70.74693 37.390888 0.8918810
## Q4-Q2 -73.31812 -127.38703 -19.249207 0.0110230
## Q4-Q3 -56.64009 -110.70900 -2.571186 0.0774173
# The intervals can be plotted as seen below:
plot(houseconf)

# Since all unfactored variables are ignored, also the addition of population to the first model, except for making minute quarterly distribution changes, doesn't make any different significantly.
## -- 2. Transforming data, diagnostics and generalized regression -- ##
House_Prices = read.csv("house_prices.csv", header = TRUE)
summary(House_Prices)
## ID Sales.Price home.ft2 lot.ft2
## Min. : 1.0 Min. : 84000 Min. : 980 Min. : 4560
## 1st Qu.:131.2 1st Qu.:180000 1st Qu.:1701 1st Qu.:17205
## Median :261.5 Median :229900 Median :2061 Median :22200
## Mean :261.5 Mean :277894 Mean :2261 Mean :24370
## 3rd Qu.:391.8 3rd Qu.:335000 3rd Qu.:2636 3rd Qu.:26787
## Max. :522.0 Max. :920000 Max. :5032 Max. :86830
## NA's :522 NA's :522 NA's :522 NA's :522
head(House_Prices)
## ID Sales.Price home.ft2 lot.ft2
## 1 NA NA NA NA
## 2 1 360000 3032 22221
## 3 NA NA NA NA
## 4 2 340000 2058 22912
## 5 NA NA NA NA
## 6 3 250000 1780 21345
# Q. (1) Clean the data by omitting NA values.
House_Prices <- na.omit(House_Prices)
summary(House_Prices)
## ID Sales.Price home.ft2 lot.ft2
## Min. : 1.0 Min. : 84000 Min. : 980 Min. : 4560
## 1st Qu.:131.2 1st Qu.:180000 1st Qu.:1701 1st Qu.:17205
## Median :261.5 Median :229900 Median :2061 Median :22200
## Mean :261.5 Mean :277894 Mean :2261 Mean :24370
## 3rd Qu.:391.8 3rd Qu.:335000 3rd Qu.:2636 3rd Qu.:26787
## Max. :522.0 Max. :920000 Max. :5032 Max. :86830
# Q. (2) Plot the distribution of X1 and X2 against the normal distribution. In other words,
# form a Q-Q plot for both of the variables X1 and X2. Does this distribution have
# heavy tail or not?
qplot(sample = House_Prices$home.ft2, data = House_Prices) + geom_abline(intercept = 0, slope = 1, color="hot pink")

qplot(sample = House_Prices$lot.ft2, data = House_Prices) + geom_abline(intercept = 0, slope = 1, color="hot pink")

# A heavy tail means that this side of the distribution produces outliers at a greater rate than you would expect from a normal distribution.
# Notice the points fall along a line in the middle of the graph, but curve off in the extremities.
# Normal Q-Q plots that exhibit this behavior usually mean the data has more extreme values than would be expected if they truly came from a Normal distribution.
# Thus, this distribution have heavy tail.
ggplot( House_Prices, aes( x = House_Prices$home.ft2, y = ..density..)) +
geom_histogram( fill =" cornsilk", colour =" grey60", size =.2, binwidth=3) + geom_density()

ggplot( House_Prices, aes( x = House_Prices$lot.ft2, y =..density..)) +
geom_histogram( fill =" cornsilk", colour =" grey60", size =.2, binwidth=3) + geom_density()

# Q. (3)
names(House_Prices) <- c("ID","Y","X1", "X2")
Z1 = log(House_Prices$X1)
Z2 = log(House_Prices$X2)
Z3 = sqrt(House_Prices$X1)
Z4 = sqrt(House_Prices$X2)
logfit <- lm(Y ~ Z1 + Z2, House_Prices)
sqrtfit <- lm(Y ~ Z3 + Z4, House_Prices)
realfit <- lm(Y ~ X1 + X2, House_Prices)
summary(logfit)
##
## Call:
## lm(formula = Y ~ Z1 + Z2, data = House_Prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -215228 -49365 -4868 30572 416630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2861006 114216 -25.05 < 2e-16 ***
## Z1 369395 12415 29.75 < 2e-16 ***
## Z2 30202 8805 3.43 0.000651 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80910 on 519 degrees of freedom
## Multiple R-squared: 0.6572, Adjusted R-squared: 0.6559
## F-statistic: 497.6 on 2 and 519 DF, p-value: < 2.2e-16
summary(sqrtfit)
##
## Call:
## lm(formula = Y ~ Z3 + Z4, data = House_Prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -222990 -42893 -3130 28052 397902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -507552.5 25835.9 -19.645 < 2e-16 ***
## Z3 15439.2 494.2 31.240 < 2e-16 ***
## Z4 391.0 106.4 3.674 0.000263 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78840 on 519 degrees of freedom
## Multiple R-squared: 0.6745, Adjusted R-squared: 0.6733
## F-statistic: 537.8 on 2 and 519 DF, p-value: < 2.2e-16
summary(realfit)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = House_Prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228421 -38178 -5506 25494 383423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.027e+05 1.265e+04 -8.121 3.39e-15 ***
## X1 1.560e+02 4.871e+00 32.019 < 2e-16 ***
## X2 1.151e+00 2.964e-01 3.882 0.000117 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78070 on 519 degrees of freedom
## Multiple R-squared: 0.6808, Adjusted R-squared: 0.6796
## F-statistic: 553.5 on 2 and 519 DF, p-value: < 2.2e-16
modtrans1 = fortify(logfit)
modtrans2 = fortify(sqrtfit)
moduntrans = fortify(realfit)
p1 <- qplot(sample = scale(.resid), data = moduntrans) + geom_abline(intercept = 0,
slope = 1, color = "red") + labs(title = "Untransformed y", y = "Residuals")
p2 <- qplot(sample = scale(.resid), data = modtrans1) + geom_abline(intercept = 0,
slope = 1, color = "red") + labs(title = "Log-Tranformed y", y = "Residuals")
p3 <- qplot(sample = scale(.resid), data = modtrans2) + geom_abline(intercept = 0,
slope = 1, color = "red") + labs(title = "Sqrt-Tranformed y", y = "Residuals")
grid.arrange(p1, p2, p3, nrow = 3)

p1 <- qplot(scale(.resid), data = moduntrans, geom = "blank") + geom_line(aes(y = ..density..,
colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) +
geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density",
values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Untransformed y",
y = "Residuals")
p2 <- qplot(scale(.resid), data = modtrans1, geom = "blank") + geom_line(aes(y = ..density..,
colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) +
geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density",
values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Log-Tranformed y",
y = "Residuals")
p3 <- qplot(scale(.resid), data = modtrans2, geom = "blank") + geom_line(aes(y = ..density..,
colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) +
geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density",
values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Sqrt-Tranformed y",
y = "Residuals")
grid.arrange(p1, p2, p3, nrow = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

boxcox(logfit, plotit=T)

boxcox(sqrtfit, plotit=T)

boxcox(realfit, plotit=T)

# From all the analysis done above, The transformed models looks better to me in comparison to the original model. Out of which, the sqrt-transform fits well on our data than the log-transform. Also, R-square value is better for sqrt-transformed model.
# Clearly the residuals of the untransformed model are not normal. The residuals of the sqrt-tranformed model look much better.
# Q. (4)
# 4.a. Plot norm of the residuals to see if they look similar in terms of magnitude.
p1 <- qplot(.fitted, .resid, data = modtrans1) + geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Residuals vs Fitted for Log Transform", x = "Fitted", y = "Residuals") + geom_smooth(color = "red",
se = F)
p2 <- qplot(.fitted, abs(.resid), data = modtrans1) + geom_hline(yintercept = 0,
linetype = "dashed") + labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") + geom_smooth(method = "lm", color = "red", se = F)
p3 <- qplot(.fitted, .resid, data = modtrans2) + geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Residuals vs Fitted for Sqrt Transform", x = "Fitted", y = "Residuals") + geom_smooth(color = "red", se = F)
p4 <- qplot(.fitted, abs(.resid), data = modtrans2) + geom_hline(yintercept = 0,
linetype = "dashed") + labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") + geom_smooth(method = "lm", color = "red", se = F)
grid.arrange(p1, p2, p3, p4, nrow = 2)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'

# Here we use the Residual vs Fitted plot to detect patterns in residuals that would indicate nonconstant error variance.
# The second plot is called the Scale-Location plot, which strengthens the pattern in the residuals by ploting the absolute values.
# In both the plots, we see some evidence of heterskedasticity, in other words nonconstant error variance.
# Though, there is similarity in the normality for residuals for both the transforms, both differ very slighlty in terms of values in the plot.
# 4.b.
group_A = sqldf('select *, "A" as Group_type from House_Prices where ID >=1 and ID <= 130')
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
group_B = sqldf('select *, "B" as Group_type from House_Prices where ID >=131 and ID <= 260')
group_C = sqldf('select *, "C" as Group_type from House_Prices where ID >=261 and ID <= 390')
group_D = sqldf('select *, "D" as Group_type from House_Prices where ID >=391 and ID <= 522')
House_Prices <- rbind(group_A,group_B,group_C,group_D)
summary(House_Prices)
## ID Y X1 X2
## Min. : 1.0 Min. : 84000 Min. : 980 Min. : 4560
## 1st Qu.:131.2 1st Qu.:180000 1st Qu.:1701 1st Qu.:17205
## Median :261.5 Median :229900 Median :2061 Median :22200
## Mean :261.5 Mean :277894 Mean :2261 Mean :24370
## 3rd Qu.:391.8 3rd Qu.:335000 3rd Qu.:2636 3rd Qu.:26787
## Max. :522.0 Max. :920000 Max. :5032 Max. :86830
## Group_type
## Length:522
## Class :character
## Mode :character
##
##
##
House_Prices$ID = factor(House_Prices$ID)
House_Prices$Group_type = factor(House_Prices$Group_type)
levels(House_Prices$Group_type)
## [1] "A" "B" "C" "D"
summary(House_Prices)
## ID Y X1 X2 Group_type
## 1 : 1 Min. : 84000 Min. : 980 Min. : 4560 A:130
## 2 : 1 1st Qu.:180000 1st Qu.:1701 1st Qu.:17205 B:130
## 3 : 1 Median :229900 Median :2061 Median :22200 C:130
## 4 : 1 Mean :277894 Mean :2261 Mean :24370 D:132
## 5 : 1 3rd Qu.:335000 3rd Qu.:2636 3rd Qu.:26787
## 6 : 1 Max. :920000 Max. :5032 Max. :86830
## (Other):516
log.resid = lm(resid(logfit) ~ House_Prices$Group_type)
summary(log.resid)
##
## Call:
## lm(formula = resid(logfit) ~ House_Prices$Group_type)
##
## Residuals:
## Min 1Q Median 3Q Max
## -255325 -42703 -388 35046 376533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40097 6756 5.935 5.38e-09 ***
## House_Prices$Group_typeB -49892 9554 -5.222 2.57e-07 ***
## House_Prices$Group_typeC -68025 9554 -7.120 3.63e-12 ***
## House_Prices$Group_typeD -42435 9518 -4.458 1.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77030 on 518 degrees of freedom
## Multiple R-squared: 0.09525, Adjusted R-squared: 0.09001
## F-statistic: 18.18 on 3 and 518 DF, p-value: 3.143e-11
sqrt.resid = lm(resid(sqrtfit) ~ House_Prices$Group_type)
summary(sqrt.resid)
##
## Call:
## lm(formula = resid(sqrtfit) ~ House_Prices$Group_type)
##
## Residuals:
## Min 1Q Median 3Q Max
## -258881 -37979 1340 33907 362011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35891 6648 5.399 1.02e-07 ***
## House_Prices$Group_typeB -43134 9402 -4.588 5.63e-06 ***
## House_Prices$Group_typeC -59333 9402 -6.311 5.97e-10 ***
## House_Prices$Group_typeD -41019 9366 -4.379 1.44e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75800 on 518 degrees of freedom
## Multiple R-squared: 0.07725, Adjusted R-squared: 0.07191
## F-statistic: 14.46 on 3 and 518 DF, p-value: 4.674e-09
summary(lm(abs(residuals(log.resid)) ~ fitted(log.resid)))
##
## Call:
## lm(formula = abs(residuals(log.resid)) ~ fitted(log.resid))
##
## Residuals:
## Min 1Q Median 3Q Max
## -89478 -33747 -8340 24605 286204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.489e+04 2.144e+03 25.60 <2e-16 ***
## fitted(log.resid) 8.838e-01 8.612e-02 10.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48990 on 520 degrees of freedom
## Multiple R-squared: 0.1684, Adjusted R-squared: 0.1668
## F-statistic: 105.3 on 1 and 520 DF, p-value: < 2.2e-16
summary(lm(abs(residuals(sqrt.resid)) ~ fitted(sqrt.resid)))
##
## Call:
## lm(formula = abs(residuals(sqrt.resid)) ~ fitted(sqrt.resid))
##
## Residuals:
## Min 1Q Median 3Q Max
## -90773 -31545 -8938 21048 271102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.325e+04 2.122e+03 25.09 <2e-16 ***
## fitted(sqrt.resid) 1.049e+00 9.711e-02 10.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48480 on 520 degrees of freedom
## Multiple R-squared: 0.1834, Adjusted R-squared: 0.1818
## F-statistic: 116.8 on 1 and 520 DF, p-value: < 2.2e-16
shapiro.test(residuals(log.resid))
##
## Shapiro-Wilk normality test
##
## data: residuals(log.resid)
## W = 0.95624, p-value = 2.511e-11
shapiro.test(residuals(sqrt.resid))
##
## Shapiro-Wilk normality test
##
## data: residuals(sqrt.resid)
## W = 0.9503, p-value = 3.052e-12
qqnorm(residuals(log.resid))
qqline(residuals(log.resid))

plot(jitter(fitted(log.resid)),residuals(log.resid),xlab="Fitted",ylab="
Residuals")
abline(h=0)

# From, Shapiro-Wilk Test, the null hypothesis is that the the residuals are normal. Since the p-value is very less, we reject this hypothesis.
# Both the residuals of the tranformations show non-normal residuals.
# From both our tranformed models and their respective residuals; also from their respective plots, we can easily conclude that there is no homogeneity of the error variance, i.e. there is non-constant variance.
# 4.c.
bartlett.test(resid(logfit) ~ House_Prices$Group_type)
##
## Bartlett test of homogeneity of variances
##
## data: resid(logfit) by House_Prices$Group_type
## Bartlett's K-squared = 175.06, df = 3, p-value < 2.2e-16
bartlett.test(resid(sqrtfit) ~ House_Prices$Group_type)
##
## Bartlett test of homogeneity of variances
##
## data: resid(sqrtfit) by House_Prices$Group_type
## Bartlett's K-squared = 201.8, df = 3, p-value < 2.2e-16
# Since the p-value is very small, we conclude that there is evidence of a non-constant variance for both the tranformations.
## Hence, we have proved from all the three methods above that both the transformed models have constant variance.
# Q. (5)
# Since we have dependent errors in the transformed data, we use Generalized Least Squares technique.
logg1 <- gls(Y ~ Z1 + Z2, House_Prices)
summary(logg1)
## Generalized least squares fit by REML
## Model: Y ~ Z1 + Z2
## Data: House_Prices
## AIC BIC logLik
## 13225.83 13242.84 -6608.916
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -2861006.5 114216.37 -25.049007 0e+00
## Z1 369395.0 12415.14 29.753592 0e+00
## Z2 30202.3 8805.19 3.430062 7e-04
##
## Correlation:
## (Intr) Z1
## Z1 -0.659
## Z2 -0.582 -0.228
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.66023596 -0.61015373 -0.06016272 0.37787308 5.14958080
##
## Residual standard error: 80905.68
## Degrees of freedom: 522 total; 519 residual
sqrtg2 <- gls(Y ~ Z3 + Z4, House_Prices)
summary(sqrtg2)
## Generalized least squares fit by REML
## Model: Y ~ Z3 + Z4
## Data: House_Prices
## AIC BIC logLik
## 13214.09 13231.1 -6603.046
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -507552.5 25835.945 -19.645209 0e+00
## Z3 15439.2 494.214 31.239946 0e+00
## Z4 391.0 106.411 3.674232 3e-04
##
## Correlation:
## (Intr) Z3
## Z3 -0.776
## Z4 -0.452 -0.196
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.8285434 -0.5440775 -0.0397021 0.3558287 5.0472318
##
## Residual standard error: 78835.72
## Degrees of freedom: 522 total; 519 residual
# Since, the Residual standard errors are very bad for the GLS model, i.e for both the transformed models,
# we can say that the fit done by the generalized least squares technique worsens the tansformed data.