library("faraway")
library("car")
## 
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
## 
##     logit, vif
library("ggplot2")
library("gridExtra")
library("sqldf")
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
library("nlme")
library("zoo")
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library("MASS")

require("MASS")
require("faraway")
require("car")
require("ggplot2")
require("gridExtra")
require("sqldf")
require("nlme")
require("zoo")

setwd("/Users/prashant0268/Desktop/")

## -- 1. Anova Models -- ##

HOUST <- read.csv("HOUST.csv", header = TRUE)

head(HOUST)
##   observation_date HOUST
## 1       1975-10-01 296.6
## 2       1976-01-01 280.8
## 3       1976-04-01 439.3
## 4       1976-07-01 434.3
## 5       1976-10-01 382.9
## 6       1977-01-01 367.4
GDP <- read.csv("GDP.csv", header = TRUE)

head(GDP)
##         DATE VALUE
## 1 1976-01-01  58.6
## 2 1976-04-01  32.4
## 3 1976-07-01  33.6
## 4 1976-10-01  47.9
## 5 1977-01-01  54.1
## 6 1977-04-01  67.7
CPI <- read.csv("CPI.csv", header = TRUE)

head(CPI)
##         DATE VALUE
## 1 1976-01-01 0.633
## 2 1976-04-01 0.500
## 3 1976-07-01 0.900
## 4 1976-10-01 0.833
## 5 1977-01-01 1.067
## 6 1977-04-01 1.033
POP <- read.csv("POP.csv", header = TRUE)

head(POP)
##   observation_date B230RC0Q173SBEA_CHG
## 1       1976-01-01                 462
## 2       1976-04-01                 562
## 3       1976-07-01                 579
## 4       1976-10-01                 510
## 5       1977-01-01                 529
## 6       1977-04-01                 617
HouseAll <- merge(x = CPI,y= GDP, by.x = "DATE" , by.y = "DATE" ,all="TRUE")

HouseAll = merge(x = HouseAll , y= HOUST ,  by.x = "DATE" , by.y= "observation_date", all = "TRUE")

HouseAll = merge(x=HouseAll, y =POP , by.x="DATE" , by.y= "observation_date", all="TRUE")

HouseAll
##           DATE VALUE.x VALUE.y HOUST B230RC0Q173SBEA_CHG
## 1   1976-01-01   0.633    58.6 280.8                 462
## 2   1976-04-01   0.500    32.4 439.3                 562
## 3   1976-07-01   0.900    33.6 434.3                 579
## 4   1976-10-01   0.833    47.9 382.9                 510
## 5   1977-01-01   1.067    54.1 367.4                 529
## 6   1977-04-01   1.033    67.7 581.1                 617
## 7   1977-07-01   0.834    62.2 561.5                 628
## 8   1977-10-01   0.900    46.3 477.1                 518
## 9   1978-01-01   1.066    40.0 362.0                 562
## 10  1978-04-01   1.434   127.9 624.5                 652
## 11  1978-07-01   1.500    62.3 563.6                 650
## 12  1978-10-01   1.533    83.3 470.2                 569
## 13  1979-01-01   1.700    49.4 325.6                 585
## 14  1979-04-01   2.200    64.3 541.9                 681
## 15  1979-07-01   2.300    74.5 498.2                 699
## 16  1979-10-01   2.333    60.3 379.3                 637
## 17  1980-01-01   3.000    65.8 238.1                 635
## 18  1980-04-01   2.667     3.4 304.3                 681
## 19  1980-07-01   1.533    60.1 388.3                 619
## 20  1980-10-01   2.334   133.5 361.5                 466
## 21  1981-01-01   2.366   138.3 264.2                 519
## 22  1981-04-01   1.834    35.5 338.7                 627
## 23  1981-07-01   2.500    93.9 270.3                 602
## 24  1981-10-01   1.500    22.3 210.9                 492
## 25  1982-01-01   0.833    -9.7 176.7                 511
## 26  1982-04-01   1.367    57.5 274.0                 592
## 27  1982-07-01   1.666    35.8 309.2                 576
## 28  1982-10-01   0.300    40.7 302.3                 472
## 29  1983-01-01   0.067    72.5 322.2                 482
## 30  1983-04-01   1.133   103.5 483.9                 575
## 31  1983-07-01   0.967   108.5 493.3                 550
## 32  1983-10-01   1.000   103.8 403.6                 452
## 33  1984-01-01   1.433   116.7 376.6                 477
## 34  1984-04-01   0.967   102.2 537.4                 575
## 35  1984-07-01   0.900    72.4 458.0                 575
## 36  1984-10-01   0.900    60.2 377.4                 441
## 37  1985-01-01   0.967    89.4 345.8                 503
## 38  1985-04-01   0.966    65.3 509.2                 613
## 39  1985-07-01   0.667    92.3 469.1                 598
## 40  1985-10-01   1.100    58.5 417.6                 474
## 41  1986-01-01   0.567    63.2 373.8                 507
## 42  1986-04-01  -0.534    38.9 558.4                 594
## 43  1986-07-01   0.667    64.4 489.8                 577
## 44  1986-10-01   0.767    49.8 383.4                 470
## 45  1987-01-01   1.333    66.8 349.1                 511
## 46  1987-04-01   1.267    85.3 480.2                 600
## 47  1987-07-01   1.200    79.0 448.0                 601
## 48  1987-10-01   1.066   122.2 343.3                 487
## 49  1988-01-01   0.900    67.9 297.2                 508
## 50  1988-04-01   1.334   117.1 443.6                 638
## 51  1988-07-01   1.433    91.8 404.9                 612
## 52  1988-10-01   1.300   113.2 342.3                 494
## 53  1989-01-01   1.367   114.7 303.7                 557
## 54  1989-04-01   1.966   101.0 404.3                 681
## 55  1989-07-01   0.967    83.2 366.4                 676
## 56  1989-10-01   1.267    51.8 301.7                 562
## 57  1990-01-01   2.166   127.4 294.6                 775
## 58  1990-04-01   1.267    83.9 357.9                 884
## 59  1990-07-01   2.233    54.8 307.1                 887
## 60  1990-10-01   2.234    -6.2 233.0                 776
## 61  1991-01-01   1.000    31.6 185.4                 805
## 62  1991-04-01   0.800    88.7 300.8                 902
## 63  1991-07-01   1.033    74.8 284.8                 870
## 64  1991-10-01   1.133    60.9 243.0                 750
## 65  1992-01-01   0.934   101.5 262.0                 854
## 66  1992-04-01   1.066   111.5 340.6                 947
## 67  1992-07-01   1.067    94.2 322.1                 891
## 68  1992-10-01   1.233   111.1 274.9                 762
## 69  1993-01-01   1.034    50.6 240.6                 787
## 70  1993-04-01   1.033    81.4 367.2                 888
## 71  1993-07-01   0.667    74.6 355.6                 833
## 72  1993-10-01   1.200   128.6 324.3                 703
## 73  1994-01-01   0.733   103.5 294.0                 770
## 74  1994-04-01   0.833   133.5 422.8                 850
## 75  1994-07-01   1.367    82.5 397.7                 808
## 76  1994-10-01   0.867   124.4 342.5                 710
## 77  1995-01-01   1.100    68.6 269.9                 754
## 78  1995-04-01   1.233    59.6 370.8                 858
## 79  1995-07-01   0.767   101.6 387.3                 820
## 80  1995-10-01   0.833    93.0 326.2                 667
## 81  1996-01-01   1.367    93.6 302.6                 764
## 82  1996-04-01   1.333   168.4 428.5                 877
## 83  1996-07-01   0.900    97.5 410.4                 863
## 84  1996-10-01   1.367   128.1 335.4                 718
## 85  1997-01-01   0.966   115.0 297.3                 778
## 86  1997-04-01   0.367   149.8 419.0                 904
## 87  1997-07-01   0.800   139.9 400.3                 855
## 88  1997-10-01   0.867    96.5 357.4                 704
## 89  1998-01-01   0.333   101.4 324.9                 753
## 90  1998-04-01   0.533   105.0 447.8                 861
## 91  1998-07-01   0.834   151.8 445.0                 836
## 92  1998-10-01   0.766   179.2 399.3                 703
## 93  1999-01-01   0.600   121.4 364.3                 761
## 94  1999-04-01   1.234   109.9 447.2                 887
## 95  1999-07-01   1.233   155.3 445.8                 841
## 96  1999-10-01   1.233   213.8 383.9                 712
## 97  2000-01-01   1.667   104.9 357.1                 698
## 98  2000-04-01   1.333   247.3 448.7                 767
## 99  2000-07-01   1.567    79.1 405.3                 749
## 100 2000-10-01   1.233   114.9 357.5                 651
## 101 2001-01-01   1.667    35.8 347.8                 669
## 102 2001-04-01   1.233   130.3 460.5                 746
## 103 2001-07-01   0.500     1.1 429.2                 727
## 104 2001-10-01  -0.133    61.8 365.4                 624
## 105 2002-01-01   0.567   133.1 369.0                 639
## 106 2002-04-01   1.400   100.4 474.6                 729
## 107 2002-07-01   0.966   102.3 458.5                 704
## 108 2002-10-01   1.067    66.7 402.9                 602
## 109 2003-01-01   1.867   126.3 374.7                 644
## 110 2003-04-01  -0.300   140.6 490.7                 721
## 111 2003-07-01   1.366   254.4 510.9                 695
## 112 2003-10-01   0.700   191.7 471.4                 568
## 113 2004-01-01   1.567   171.6 424.7                 638
## 114 2004-04-01   1.467   193.0 539.4                 728
## 115 2004-07-01   1.200   186.3 531.9                 731
## 116 2004-10-01   2.033   194.5 459.6                 623
## 117 2005-01-01   0.967   251.5 448.2                 631
## 118 2005-04-01   1.300   160.4 575.3                 752
## 119 2005-07-01   2.933   231.3 567.5                 746
## 120 2005-10-01   1.833   176.2 477.1                 650
## 121 2006-01-01   1.034   267.3 464.0                 672
## 122 2006-04-01   1.800   150.9 520.9                 772
## 123 2006-07-01   1.900   108.7 457.8                 766
## 124 2006-10-01  -0.834   157.9 358.2                 663
## 125 2007-01-01   1.984   166.8 321.9                 675
## 126 2007-04-01   2.314   189.1 409.9                 778
## 127 2007-07-01   1.308   147.4 350.6                 767
## 128 2007-10-01   2.551   115.6 272.7                 665
## 129 2008-01-01   2.280   -16.9 231.4                 666
## 130 2008-04-01   2.768   144.6 283.7                 742
## 131 2008-07-01   3.323    30.0 237.0                 714
## 132 2008-10-01  -5.012  -293.1 153.4                 621
## 133 2009-01-01  -1.471  -166.0 114.4                 629
## 134 2009-04-01   1.129   -43.5 153.8                 707
## 135 2009-07-01   1.837    43.7 162.3                 712
## 136 2009-10-01   1.686   182.4 123.4                 616
## 137 2010-01-01   0.344   114.6 134.3                 571
## 138 2010-04-01  -0.077   207.5 172.0                 633
## 139 2010-07-01   0.637   169.1 160.8                 646
## 140 2010-10-01   1.765   172.5 119.8                 536
## 141 2011-01-01   2.345     8.2 125.5                 542
## 142 2011-04-01   2.524   222.5 163.5                 639
## 143 2011-07-01   1.465   126.2 170.9                 622
## 144 2011-10-01   1.014   198.2 148.9                 526
## 145 2012-01-01   1.330   188.6 154.9                 530
## 146 2012-04-01   0.461   148.0 209.3                 634
## 147 2012-07-01   0.931   106.0 214.0                 638
## 148 2012-10-01   1.633    69.4 202.4                 494
## 149 2013-01-01   0.918   178.1 208.1                 521
## 150 2013-04-01  -0.284    66.0 244.2                 652
## 151 2013-07-01   1.177   207.9 242.8                 670
## 152 2013-10-01   1.072   250.6 229.8                 546
## 153 2014-01-01   1.366    25.3 206.0                 565
## 154 2014-04-01   1.120   260.4 274.7                 678
## 155 2014-07-01   0.540   283.8 281.4                 678
## 156 2014-10-01  -0.186   122.8 241.2                 549
## 157 2015-01-01  -1.716    91.4 214.6                 566
## 158 2015-04-01   1.423   214.7 320.4                 678
## 159 2015-07-01   0.810   143.6 318.0                 678
## 160 2015-10-01   0.455    80.9 258.9                 549
## 161 2016-01-01  -0.186    58.8    NA                  NA
## 162 2016-04-01      NA   168.5    NA                  NA
## 163 2016-07-01      NA   207.8    NA                  NA
## 164 1975-10-01      NA      NA 296.6                  NA
## 165                 NA      NA    NA                  NA
summary(HouseAll)
##          DATE        VALUE.x          VALUE.y            HOUST      
##  1976-01-01:  1   Min.   :-5.012   Min.   :-293.10   Min.   :114.4  
##  1976-04-01:  1   1st Qu.: 0.833   1st Qu.:  62.25   1st Qu.:274.7  
##  1976-07-01:  1   Median : 1.120   Median : 101.40   Median :357.4  
##  1976-10-01:  1   Mean   : 1.134   Mean   : 103.63   Mean   :351.6  
##  1977-01-01:  1   3rd Qu.: 1.500   3rd Qu.: 142.10   3rd Qu.:439.3  
##  1977-04-01:  1   Max.   : 3.323   Max.   : 283.80   Max.   :624.5  
##  (Other)   :159   NA's   :4        NA's   :2         NA's   :4      
##  B230RC0Q173SBEA_CHG
##  Min.   :441.0      
##  1st Qu.:574.0      
##  Median :650.5      
##  Mean   :662.0      
##  3rd Qu.:746.8      
##  Max.   :947.0      
##  NA's   :5
HouseAll <- na.omit(HouseAll)

quartdf = as.yearqtr(HouseAll$DATE, format = "%Y-%m-%d")                        

quartdf
##   [1] "1976 Q1" "1976 Q2" "1976 Q3" "1976 Q4" "1977 Q1" "1977 Q2" "1977 Q3"
##   [8] "1977 Q4" "1978 Q1" "1978 Q2" "1978 Q3" "1978 Q4" "1979 Q1" "1979 Q2"
##  [15] "1979 Q3" "1979 Q4" "1980 Q1" "1980 Q2" "1980 Q3" "1980 Q4" "1981 Q1"
##  [22] "1981 Q2" "1981 Q3" "1981 Q4" "1982 Q1" "1982 Q2" "1982 Q3" "1982 Q4"
##  [29] "1983 Q1" "1983 Q2" "1983 Q3" "1983 Q4" "1984 Q1" "1984 Q2" "1984 Q3"
##  [36] "1984 Q4" "1985 Q1" "1985 Q2" "1985 Q3" "1985 Q4" "1986 Q1" "1986 Q2"
##  [43] "1986 Q3" "1986 Q4" "1987 Q1" "1987 Q2" "1987 Q3" "1987 Q4" "1988 Q1"
##  [50] "1988 Q2" "1988 Q3" "1988 Q4" "1989 Q1" "1989 Q2" "1989 Q3" "1989 Q4"
##  [57] "1990 Q1" "1990 Q2" "1990 Q3" "1990 Q4" "1991 Q1" "1991 Q2" "1991 Q3"
##  [64] "1991 Q4" "1992 Q1" "1992 Q2" "1992 Q3" "1992 Q4" "1993 Q1" "1993 Q2"
##  [71] "1993 Q3" "1993 Q4" "1994 Q1" "1994 Q2" "1994 Q3" "1994 Q4" "1995 Q1"
##  [78] "1995 Q2" "1995 Q3" "1995 Q4" "1996 Q1" "1996 Q2" "1996 Q3" "1996 Q4"
##  [85] "1997 Q1" "1997 Q2" "1997 Q3" "1997 Q4" "1998 Q1" "1998 Q2" "1998 Q3"
##  [92] "1998 Q4" "1999 Q1" "1999 Q2" "1999 Q3" "1999 Q4" "2000 Q1" "2000 Q2"
##  [99] "2000 Q3" "2000 Q4" "2001 Q1" "2001 Q2" "2001 Q3" "2001 Q4" "2002 Q1"
## [106] "2002 Q2" "2002 Q3" "2002 Q4" "2003 Q1" "2003 Q2" "2003 Q3" "2003 Q4"
## [113] "2004 Q1" "2004 Q2" "2004 Q3" "2004 Q4" "2005 Q1" "2005 Q2" "2005 Q3"
## [120] "2005 Q4" "2006 Q1" "2006 Q2" "2006 Q3" "2006 Q4" "2007 Q1" "2007 Q2"
## [127] "2007 Q3" "2007 Q4" "2008 Q1" "2008 Q2" "2008 Q3" "2008 Q4" "2009 Q1"
## [134] "2009 Q2" "2009 Q3" "2009 Q4" "2010 Q1" "2010 Q2" "2010 Q3" "2010 Q4"
## [141] "2011 Q1" "2011 Q2" "2011 Q3" "2011 Q4" "2012 Q1" "2012 Q2" "2012 Q3"
## [148] "2012 Q4" "2013 Q1" "2013 Q2" "2013 Q3" "2013 Q4" "2014 Q1" "2014 Q2"
## [155] "2014 Q3" "2014 Q4" "2015 Q1" "2015 Q2" "2015 Q3" "2015 Q4"
quarter <- format(quartdf, format="%q")

quarter
##   [1] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
##  [18] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2"
##  [35] "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3"
##  [52] "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
##  [69] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
##  [86] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2"
## [103] "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3"
## [120] "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
## [137] "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1"
## [154] "2" "3" "4" "1" "2" "3" "4"
HouseAll <- data.frame(HouseAll, quarter)

HouseAll$quarter <- factor(HouseAll$quarter)

levels(HouseAll$quarter) <- c("Q1","Q2","Q3","Q4")

names(HouseAll) <- c("DATE", "CPIValue", "GDPValue", "HOUSTValue", "POPValue", "quarter")

names(HouseAll)
## [1] "DATE"       "CPIValue"   "GDPValue"   "HOUSTValue" "POPValue"  
## [6] "quarter"
summary(HouseAll)
##          DATE        CPIValue         GDPValue         HOUSTValue   
##  1976-01-01:  1   Min.   :-5.012   Min.   :-293.10   Min.   :114.4  
##  1976-04-01:  1   1st Qu.: 0.833   1st Qu.:  62.27   1st Qu.:274.5  
##  1976-07-01:  1   Median : 1.125   Median : 101.20   Median :357.4  
##  1976-10-01:  1   Mean   : 1.143   Mean   : 102.86   Mean   :351.9  
##  1977-01-01:  1   3rd Qu.: 1.500   3rd Qu.: 140.07   3rd Qu.:440.4  
##  1977-04-01:  1   Max.   : 3.323   Max.   : 283.80   Max.   :624.5  
##  (Other)   :154                                                     
##     POPValue     quarter
##  Min.   :441.0   Q1:40  
##  1st Qu.:574.0   Q2:40  
##  Median :650.5   Q3:40  
##  Mean   :662.0   Q4:40  
##  3rd Qu.:746.8          
##  Max.   :947.0          
## 
# FIRST MODEL:

housemod <- lm(HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + HouseAll$quarter)
sumary(housemod)
##                     Estimate Std. Error t value  Pr(>|t|)
## (Intercept)        271.06863   20.91476 12.9606 < 2.2e-16
## HouseAll$GDPValue    0.22075    0.12069  1.8290 0.0693276
## HouseAll$CPIValue    1.84682    9.83017  0.1879 0.8512239
## HouseAll$quarterQ2 105.33630   23.53808  4.4751 1.477e-05
## HouseAll$quarterQ3  88.28523   23.45485  3.7641 0.0002373
## HouseAll$quarterQ4  30.49725   23.42023  1.3022 0.1948005
## 
## n = 160, p = 6, Residual SE = 104.40269, R-Squared = 0.18
summary(housemod)
## 
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + 
##     HouseAll$quarter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -266.68  -69.19   12.62   71.28  217.21 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        271.0686    20.9148  12.961  < 2e-16 ***
## HouseAll$GDPValue    0.2208     0.1207   1.829 0.069328 .  
## HouseAll$CPIValue    1.8468     9.8302   0.188 0.851224    
## HouseAll$quarterQ2 105.3363    23.5381   4.475 1.48e-05 ***
## HouseAll$quarterQ3  88.2852    23.4548   3.764 0.000237 ***
## HouseAll$quarterQ4  30.4973    23.4202   1.302 0.194801    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104.4 on 154 degrees of freedom
## Multiple R-squared:  0.1782, Adjusted R-squared:  0.1515 
## F-statistic: 6.677 on 5 and 154 DF,  p-value: 1.173e-05
# Q. (2) Use one-way ANOVA to determine whether there’s a seasonal effect. Show necessary steps and explanation.

require(ggplot2)

ggplot(HouseAll, aes(x = HouseAll$quarter, y = HouseAll$HOUSTValue )) +
  geom_boxplot(fill = "grey80", colour = "blue") +
  scale_x_discrete() + xlab("Quarter Distribution") +
  ylab("New Privately Owned Housing Units Started")

# Initial inspection of the data suggests that there are differences in the housing units start counts for the quarters; but it is not so clear cut to determine whether the quarterly data is different within itself.

# To investigate these differences we fit the one-way ANOVA model using the lm function and look at the parameter estimates and standard errors for the treatment effects. The function call is:

housemod1 <- lm(HouseAll$HOUSTValue ~ HouseAll$quarter, data = HouseAll)

# We save the model fitted to the data in an object so that we can undertake various actions to study the goodness of the fit to the data and other model assumptions. The standard summary of a lm object is used to produce the following output:

summary(housemod1)
## 
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$quarter, data = HouseAll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -250.32  -76.57   17.00   72.60  220.38 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          292.88      16.61  17.633  < 2e-16 ***
## HouseAll$quarterQ2   111.24      23.49   4.736 4.87e-06 ***
## HouseAll$quarterQ3    92.36      23.49   3.932 0.000126 ***
## HouseAll$quarterQ4    32.51      23.49   1.384 0.168265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105 on 156 degrees of freedom
## Multiple R-squared:  0.1572, Adjusted R-squared:  0.1409 
## F-statistic: 9.696 on 3 and 156 DF,  p-value: 6.625e-06
round(coef(housemod1),1)
##        (Intercept) HouseAll$quarterQ2 HouseAll$quarterQ3 
##              292.9              111.2               92.4 
## HouseAll$quarterQ4 
##               32.5
round(coef(housemod))
##        (Intercept)  HouseAll$GDPValue  HouseAll$CPIValue 
##                271                  0                  2 
## HouseAll$quarterQ2 HouseAll$quarterQ3 HouseAll$quarterQ4 
##                105                 88                 30
# Q1 is the reference level and has a mean of 271, Q2,3,4 are 105, 88 and 30 units larger, respectively on an average.

# The model output indicates some evidence of a difference in the average count for the quarter groups. An analysis of variance table for this model can be produced via the anova command:

anova(housemod)
## Analysis of Variance Table
## 
## Response: HouseAll$HOUSTValue
##                    Df  Sum Sq Mean Sq F value    Pr(>F)    
## HouseAll$GDPValue   1   76745   76745  7.0408  0.008803 ** 
## HouseAll$CPIValue   1    2025    2025  0.1858  0.667068    
## HouseAll$quarter    3  285122   95041  8.7194 2.229e-05 ***
## Residuals         154 1678588   10900                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This table confirms that there are differences between the groups which were highlighted in the model summary. The function confint is used to calculate confidence intervals on the treatment parameters, by default 95% confidence intervals:

confint(housemod)
##                           2.5 %      97.5 %
## (Intercept)        229.75176751 312.3854828
## HouseAll$GDPValue   -0.01767501   0.4591816
## HouseAll$CPIValue  -17.57257332  21.2662125
## HouseAll$quarterQ2  58.83709948 151.8354953
## HouseAll$quarterQ3  41.95045749 134.6200077
## HouseAll$quarterQ4 -15.76912706  76.7636333
house.mod = data.frame(Fitted = fitted(housemod),
                       Residuals = resid(housemod), Quarters = HouseAll$quarter)

# and then produce the plot:

ggplot(house.mod, aes(Fitted, Residuals, colour = Quarters)) + geom_point()

# There is some evidence of different variabilities in the spread of the residuals for the quarters.
# Thus, we could determine using one way ANOVA that there is seasonal effect on the New Privately Owned Housing Units Start counts.

# Q. (3) Do pair-wise comparison for different levels. In particular, construct %90 confi- dence intervals for the pairwise differences.

aov(housemod)
## Call:
##    aov(formula = housemod)
## 
## Terms:
##                 HouseAll$GDPValue HouseAll$CPIValue HouseAll$quarter
## Sum of Squares            76744.6            2024.8         285121.6
## Deg. of Freedom                 1                 1                3
##                 Residuals
## Sum of Squares  1678588.0
## Deg. of Freedom       154
## 
## Residual standard error: 104.4027
## Estimated effects may be unbalanced
results <- aov(housemod)

houseconf <- TukeyHSD(results, conf.level = 0.90)
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$GDPValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$CPIValue
## Warning in TukeyHSD.aov(results, conf.level = 0.9): 'which' specified some
## non-factors which will be dropped
houseconf
##   Tukey multiple comparisons of means
##     90% family-wise confidence level
## 
## Fit: aov(formula = housemod)
## 
## $`HouseAll$quarter`
##            diff        lwr        upr     p adj
## Q2-Q1 103.40958   49.46254 157.356616 0.0001042
## Q3-Q1  86.73156   32.78452 140.678595 0.0016008
## Q4-Q1  30.09146  -23.85557  84.038500 0.5713311
## Q3-Q2 -16.67802  -70.62506  37.269016 0.8912549
## Q4-Q2 -73.31812 -127.26515 -19.371079 0.0107807
## Q4-Q3 -56.64009 -110.58713  -2.693057 0.0764098
# The intervals can be plotted as seen below:

plot(houseconf)

# Q. (4) Add population to the first model, do the above 2 steps again.

housemod2 <- lm(HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + HouseAll$quarter + HouseAll$POPValue)

summary(housemod2)
## 
## Call:
## lm(formula = HouseAll$HOUSTValue ~ HouseAll$GDPValue + HouseAll$CPIValue + 
##     HouseAll$quarter + HouseAll$POPValue)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -271.25  -64.11   15.78   70.93  213.90 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        299.00784   53.40285   5.599 9.73e-08 ***
## HouseAll$GDPValue    0.22720    0.12149   1.870  0.06337 .  
## HouseAll$CPIValue    1.88789    9.85210   0.192  0.84829    
## HouseAll$quarterQ2 109.61624   24.76083   4.427 1.81e-05 ***
## HouseAll$quarterQ3  91.91961   24.35938   3.773  0.00023 ***
## HouseAll$quarterQ4  28.98273   23.62236   1.227  0.22174    
## HouseAll$POPValue   -0.04569    0.08032  -0.569  0.57031    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104.6 on 153 degrees of freedom
## Multiple R-squared:  0.1799, Adjusted R-squared:  0.1477 
## F-statistic: 5.594 on 6 and 153 DF,  p-value: 2.852e-05
round(coef(housemod2))
##        (Intercept)  HouseAll$GDPValue  HouseAll$CPIValue 
##                299                  0                  2 
## HouseAll$quarterQ2 HouseAll$quarterQ3 HouseAll$quarterQ4 
##                110                 92                 29 
##  HouseAll$POPValue 
##                  0
# 4.1) is the reference level and has a mean of 299, Q2,3,4 are 110, 92 and 29 units larger, respectively on an average.

# The model output indicates some evidence of a difference in the average count for the quarter groups. An analysis of variance table for this model can be produced via the anova command:

anova(housemod2)
## Analysis of Variance Table
## 
## Response: HouseAll$HOUSTValue
##                    Df  Sum Sq Mean Sq F value   Pr(>F)    
## HouseAll$GDPValue   1   76745   76745  7.0099 0.008955 ** 
## HouseAll$CPIValue   1    2025    2025  0.1849 0.667759    
## HouseAll$quarter    3  285122   95041  8.6811 2.35e-05 ***
## HouseAll$POPValue   1    3542    3542  0.3236 0.570308    
## Residuals         153 1675046   10948                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This table confirms that there are differences between the groups which were highlighted in the model summary. The function confint is used to calculate confidence intervals on the treatment parameters, by default 95% confidence intervals:

confint(housemod2)
##                           2.5 %      97.5 %
## (Intercept)        193.50569403 404.5099869
## HouseAll$GDPValue   -0.01280996   0.4672180
## HouseAll$CPIValue  -17.57582059  21.3516062
## HouseAll$quarterQ2  60.69898924 158.5334880
## HouseAll$quarterQ3  43.79544620 140.0437679
## HouseAll$quarterQ4 -17.68538422  75.6508442
## HouseAll$POPValue   -0.20437938   0.1129978
house.mod = data.frame(Fitted = fitted(housemod),
                       Residuals = resid(housemod), Quarters = HouseAll$quarter)

# and then produce the plot:

ggplot(house.mod, aes(Fitted, Residuals, colour = Quarters)) + geom_point()

# There is some evidence of different variabilities in the spread of the residuals for the quarters.
# Thus, we could determine using one way ANOVA that there is seasonal effect on the New Privately Owned Housing Units Start counts.

# Q. (3) Do pair-wise comparison for different levels. In particular, construct %90 confi- dence intervals for the pairwise differences.

aov(housemod2)
## Call:
##    aov(formula = housemod2)
## 
## Terms:
##                 HouseAll$GDPValue HouseAll$CPIValue HouseAll$quarter
## Sum of Squares            76744.6            2024.8         285121.6
## Deg. of Freedom                 1                 1                3
##                 HouseAll$POPValue Residuals
## Sum of Squares             3542.4 1675045.6
## Deg. of Freedom                 1       153
## 
## Residual standard error: 104.6327
## Estimated effects may be unbalanced
results <- aov(housemod2)

houseconf <- TukeyHSD(results, conf.level = 0.90)
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$GDPValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$CPIValue
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## HouseAll$POPValue
## Warning in TukeyHSD.aov(results, conf.level = 0.9): 'which' specified some
## non-factors which will be dropped
houseconf
##   Tukey multiple comparisons of means
##     90% family-wise confidence level
## 
## Fit: aov(formula = housemod2)
## 
## $`HouseAll$quarter`
##            diff        lwr        upr     p adj
## Q2-Q1 103.40958   49.34067 157.478488 0.0001089
## Q3-Q1  86.73156   32.66265 140.800466 0.0016512
## Q4-Q1  30.09146  -23.97745  84.160372 0.5731405
## Q3-Q2 -16.67802  -70.74693  37.390888 0.8918810
## Q4-Q2 -73.31812 -127.38703 -19.249207 0.0110230
## Q4-Q3 -56.64009 -110.70900  -2.571186 0.0774173
# The intervals can be plotted as seen below:

plot(houseconf)

# Since all unfactored variables are ignored, also the addition of population to the first model, except for making minute quarterly distribution changes, doesn't make any different significantly.

## -- 2. Transforming data, diagnostics and generalized regression -- ##

House_Prices = read.csv("house_prices.csv", header = TRUE)

summary(House_Prices)
##        ID         Sales.Price        home.ft2       lot.ft2     
##  Min.   :  1.0   Min.   : 84000   Min.   : 980   Min.   : 4560  
##  1st Qu.:131.2   1st Qu.:180000   1st Qu.:1701   1st Qu.:17205  
##  Median :261.5   Median :229900   Median :2061   Median :22200  
##  Mean   :261.5   Mean   :277894   Mean   :2261   Mean   :24370  
##  3rd Qu.:391.8   3rd Qu.:335000   3rd Qu.:2636   3rd Qu.:26787  
##  Max.   :522.0   Max.   :920000   Max.   :5032   Max.   :86830  
##  NA's   :522     NA's   :522      NA's   :522    NA's   :522
head(House_Prices)
##   ID Sales.Price home.ft2 lot.ft2
## 1 NA          NA       NA      NA
## 2  1      360000     3032   22221
## 3 NA          NA       NA      NA
## 4  2      340000     2058   22912
## 5 NA          NA       NA      NA
## 6  3      250000     1780   21345
# Q. (1) Clean the data by omitting NA values.

House_Prices <- na.omit(House_Prices)

summary(House_Prices)
##        ID         Sales.Price        home.ft2       lot.ft2     
##  Min.   :  1.0   Min.   : 84000   Min.   : 980   Min.   : 4560  
##  1st Qu.:131.2   1st Qu.:180000   1st Qu.:1701   1st Qu.:17205  
##  Median :261.5   Median :229900   Median :2061   Median :22200  
##  Mean   :261.5   Mean   :277894   Mean   :2261   Mean   :24370  
##  3rd Qu.:391.8   3rd Qu.:335000   3rd Qu.:2636   3rd Qu.:26787  
##  Max.   :522.0   Max.   :920000   Max.   :5032   Max.   :86830
# Q. (2) Plot the distribution of X1 and X2 against the normal distribution. In other words,
# form a Q-Q plot for both of the variables X1 and X2. Does this distribution have
# heavy tail or not?

qplot(sample = House_Prices$home.ft2, data = House_Prices) + geom_abline(intercept = 0, slope = 1, color="hot pink")

qplot(sample = House_Prices$lot.ft2, data = House_Prices) + geom_abline(intercept = 0, slope = 1, color="hot pink")

# A heavy tail means that this side of the distribution produces outliers at a greater rate than you would expect from a normal distribution.
# Notice the points fall along a line in the middle of the graph, but curve off in the extremities. 
# Normal Q-Q plots that exhibit this behavior usually mean the data has more extreme values than would be expected if they truly came from a Normal distribution.
# Thus, this distribution have heavy tail.

ggplot( House_Prices, aes( x = House_Prices$home.ft2, y = ..density..)) +  
geom_histogram( fill =" cornsilk", colour =" grey60", size =.2, binwidth=3) + geom_density()

ggplot( House_Prices, aes( x = House_Prices$lot.ft2, y =..density..)) +  
geom_histogram( fill =" cornsilk", colour =" grey60", size =.2, binwidth=3) + geom_density()

# Q. (3)

names(House_Prices) <- c("ID","Y","X1", "X2")

Z1 = log(House_Prices$X1)
Z2 = log(House_Prices$X2) 

Z3 = sqrt(House_Prices$X1)
Z4 = sqrt(House_Prices$X2)

logfit <- lm(Y ~ Z1 + Z2, House_Prices)

sqrtfit <- lm(Y ~ Z3 + Z4, House_Prices)

realfit <- lm(Y ~ X1 + X2, House_Prices)

summary(logfit)
## 
## Call:
## lm(formula = Y ~ Z1 + Z2, data = House_Prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -215228  -49365   -4868   30572  416630 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2861006     114216  -25.05  < 2e-16 ***
## Z1            369395      12415   29.75  < 2e-16 ***
## Z2             30202       8805    3.43 0.000651 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80910 on 519 degrees of freedom
## Multiple R-squared:  0.6572, Adjusted R-squared:  0.6559 
## F-statistic: 497.6 on 2 and 519 DF,  p-value: < 2.2e-16
summary(sqrtfit)
## 
## Call:
## lm(formula = Y ~ Z3 + Z4, data = House_Prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222990  -42893   -3130   28052  397902 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -507552.5    25835.9 -19.645  < 2e-16 ***
## Z3            15439.2      494.2  31.240  < 2e-16 ***
## Z4              391.0      106.4   3.674 0.000263 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78840 on 519 degrees of freedom
## Multiple R-squared:  0.6745, Adjusted R-squared:  0.6733 
## F-statistic: 537.8 on 2 and 519 DF,  p-value: < 2.2e-16
summary(realfit)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = House_Prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -228421  -38178   -5506   25494  383423 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.027e+05  1.265e+04  -8.121 3.39e-15 ***
## X1           1.560e+02  4.871e+00  32.019  < 2e-16 ***
## X2           1.151e+00  2.964e-01   3.882 0.000117 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78070 on 519 degrees of freedom
## Multiple R-squared:  0.6808, Adjusted R-squared:  0.6796 
## F-statistic: 553.5 on 2 and 519 DF,  p-value: < 2.2e-16
modtrans1 = fortify(logfit)

modtrans2 = fortify(sqrtfit)

moduntrans = fortify(realfit)

p1 <- qplot(sample = scale(.resid), data = moduntrans) + geom_abline(intercept = 0, 
                                                                slope = 1, color = "red") + labs(title = "Untransformed y", y = "Residuals")
p2 <- qplot(sample = scale(.resid), data = modtrans1) + geom_abline(intercept = 0, 
                                                                slope = 1, color = "red") + labs(title = "Log-Tranformed y", y = "Residuals")
p3 <- qplot(sample = scale(.resid), data = modtrans2) + geom_abline(intercept = 0, 
                                                                slope = 1, color = "red") + labs(title = "Sqrt-Tranformed y", y = "Residuals")
grid.arrange(p1, p2, p3, nrow = 3)

p1 <- qplot(scale(.resid), data = moduntrans, geom = "blank") + geom_line(aes(y = ..density.., 
                                                                         colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) + 
  geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density", 
                                                                          values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Untransformed y", 
                                                                                                                                                     y = "Residuals")
p2 <- qplot(scale(.resid), data = modtrans1, geom = "blank") + geom_line(aes(y = ..density.., 
                                                                         colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) + 
  geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density", 
                                                                          values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Log-Tranformed y", 
                                                                                                                                                     y = "Residuals")

p3 <- qplot(scale(.resid), data = modtrans2, geom = "blank") + geom_line(aes(y = ..density.., 
                                                                         colour = "Empirical"), stat = "density") + stat_function(fun = dnorm, aes(colour = "Normal")) + 
  geom_histogram(aes(y = ..density..), alpha = 0.4) + scale_colour_manual(name = "Density", 
                                                                          values = c("red", "blue")) + theme(legend.position = c(0.85, 0.85)) + labs(title = "Sqrt-Tranformed y", 
                                                                                                                                                     y = "Residuals")

grid.arrange(p1, p2, p3, nrow = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

boxcox(logfit, plotit=T)

boxcox(sqrtfit, plotit=T)

boxcox(realfit, plotit=T)

# From all the analysis done above, The transformed models looks better to me in comparison to the original model. Out of which, the sqrt-transform fits well on our data than the log-transform. Also, R-square value is better for sqrt-transformed model.
# Clearly the residuals of the untransformed model are not normal. The residuals of the sqrt-tranformed model look much better.


# Q. (4) 

# 4.a. Plot norm of the residuals to see if they look similar in terms of magnitude.

p1 <- qplot(.fitted, .resid, data = modtrans1) + geom_hline(yintercept = 0, linetype = "dashed") + 
  labs(title = "Residuals vs Fitted for Log Transform", x = "Fitted", y = "Residuals") + geom_smooth(color = "red", 
                                                                                   se = F)

p2 <- qplot(.fitted, abs(.resid), data = modtrans1) + geom_hline(yintercept = 0, 
                                                             linetype = "dashed") + labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") + geom_smooth(method = "lm", color = "red", se = F)


p3 <- qplot(.fitted, .resid, data = modtrans2) + geom_hline(yintercept = 0, linetype = "dashed") + 
  labs(title = "Residuals vs Fitted for Sqrt Transform", x = "Fitted", y = "Residuals") + geom_smooth(color = "red", se = F)


p4 <- qplot(.fitted, abs(.resid), data = modtrans2) + geom_hline(yintercept = 0, 
                                                             linetype = "dashed") + labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") + geom_smooth(method = "lm", color = "red", se = F)


grid.arrange(p1, p2, p3, p4, nrow = 2)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'

# Here we use the Residual vs Fitted plot to detect patterns in residuals that would indicate nonconstant error variance.
# The second plot is called the Scale-Location plot, which strengthens the pattern in the residuals by ploting the absolute values.
# In both the plots, we see some evidence of heterskedasticity, in other words nonconstant error variance.
# Though, there is similarity in the normality for residuals for both the transforms, both differ very slighlty in terms of values in the plot.


# 4.b. 

group_A = sqldf('select *, "A" as Group_type from House_Prices where ID >=1 and ID  <= 130')
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
group_B = sqldf('select *, "B" as Group_type from House_Prices where ID >=131 and ID  <= 260')
group_C = sqldf('select *, "C" as Group_type from House_Prices where ID >=261 and ID  <= 390')
group_D = sqldf('select *, "D" as Group_type from House_Prices where ID >=391 and ID  <= 522')

House_Prices <- rbind(group_A,group_B,group_C,group_D)

summary(House_Prices)
##        ID              Y                X1             X2       
##  Min.   :  1.0   Min.   : 84000   Min.   : 980   Min.   : 4560  
##  1st Qu.:131.2   1st Qu.:180000   1st Qu.:1701   1st Qu.:17205  
##  Median :261.5   Median :229900   Median :2061   Median :22200  
##  Mean   :261.5   Mean   :277894   Mean   :2261   Mean   :24370  
##  3rd Qu.:391.8   3rd Qu.:335000   3rd Qu.:2636   3rd Qu.:26787  
##  Max.   :522.0   Max.   :920000   Max.   :5032   Max.   :86830  
##   Group_type       
##  Length:522        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
House_Prices$ID = factor(House_Prices$ID)

House_Prices$Group_type = factor(House_Prices$Group_type)

levels(House_Prices$Group_type)
## [1] "A" "B" "C" "D"
summary(House_Prices)
##        ID            Y                X1             X2        Group_type
##  1      :  1   Min.   : 84000   Min.   : 980   Min.   : 4560   A:130     
##  2      :  1   1st Qu.:180000   1st Qu.:1701   1st Qu.:17205   B:130     
##  3      :  1   Median :229900   Median :2061   Median :22200   C:130     
##  4      :  1   Mean   :277894   Mean   :2261   Mean   :24370   D:132     
##  5      :  1   3rd Qu.:335000   3rd Qu.:2636   3rd Qu.:26787             
##  6      :  1   Max.   :920000   Max.   :5032   Max.   :86830             
##  (Other):516
log.resid = lm(resid(logfit) ~ House_Prices$Group_type)

summary(log.resid)
## 
## Call:
## lm(formula = resid(logfit) ~ House_Prices$Group_type)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -255325  -42703    -388   35046  376533 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40097       6756   5.935 5.38e-09 ***
## House_Prices$Group_typeB   -49892       9554  -5.222 2.57e-07 ***
## House_Prices$Group_typeC   -68025       9554  -7.120 3.63e-12 ***
## House_Prices$Group_typeD   -42435       9518  -4.458 1.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77030 on 518 degrees of freedom
## Multiple R-squared:  0.09525,    Adjusted R-squared:  0.09001 
## F-statistic: 18.18 on 3 and 518 DF,  p-value: 3.143e-11
sqrt.resid = lm(resid(sqrtfit) ~ House_Prices$Group_type)

summary(sqrt.resid)
## 
## Call:
## lm(formula = resid(sqrtfit) ~ House_Prices$Group_type)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -258881  -37979    1340   33907  362011 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 35891       6648   5.399 1.02e-07 ***
## House_Prices$Group_typeB   -43134       9402  -4.588 5.63e-06 ***
## House_Prices$Group_typeC   -59333       9402  -6.311 5.97e-10 ***
## House_Prices$Group_typeD   -41019       9366  -4.379 1.44e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75800 on 518 degrees of freedom
## Multiple R-squared:  0.07725,    Adjusted R-squared:  0.07191 
## F-statistic: 14.46 on 3 and 518 DF,  p-value: 4.674e-09
summary(lm(abs(residuals(log.resid)) ~ fitted(log.resid)))
## 
## Call:
## lm(formula = abs(residuals(log.resid)) ~ fitted(log.resid))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89478 -33747  -8340  24605 286204 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.489e+04  2.144e+03   25.60   <2e-16 ***
## fitted(log.resid) 8.838e-01  8.612e-02   10.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48990 on 520 degrees of freedom
## Multiple R-squared:  0.1684, Adjusted R-squared:  0.1668 
## F-statistic: 105.3 on 1 and 520 DF,  p-value: < 2.2e-16
summary(lm(abs(residuals(sqrt.resid)) ~ fitted(sqrt.resid)))
## 
## Call:
## lm(formula = abs(residuals(sqrt.resid)) ~ fitted(sqrt.resid))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90773 -31545  -8938  21048 271102 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.325e+04  2.122e+03   25.09   <2e-16 ***
## fitted(sqrt.resid) 1.049e+00  9.711e-02   10.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48480 on 520 degrees of freedom
## Multiple R-squared:  0.1834, Adjusted R-squared:  0.1818 
## F-statistic: 116.8 on 1 and 520 DF,  p-value: < 2.2e-16
shapiro.test(residuals(log.resid))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(log.resid)
## W = 0.95624, p-value = 2.511e-11
shapiro.test(residuals(sqrt.resid))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(sqrt.resid)
## W = 0.9503, p-value = 3.052e-12
qqnorm(residuals(log.resid))
qqline(residuals(log.resid))

plot(jitter(fitted(log.resid)),residuals(log.resid),xlab="Fitted",ylab="
    Residuals")
abline(h=0)

# From, Shapiro-Wilk Test, the null hypothesis is that the the residuals are normal. Since the p-value is very less, we reject this hypothesis.
# Both the residuals of the tranformations show non-normal residuals.
# From both our tranformed models and their respective residuals; also from their respective plots, we can easily conclude that there is no homogeneity of the error variance, i.e. there is non-constant variance.


# 4.c. 

bartlett.test(resid(logfit) ~ House_Prices$Group_type)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  resid(logfit) by House_Prices$Group_type
## Bartlett's K-squared = 175.06, df = 3, p-value < 2.2e-16
bartlett.test(resid(sqrtfit) ~ House_Prices$Group_type)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  resid(sqrtfit) by House_Prices$Group_type
## Bartlett's K-squared = 201.8, df = 3, p-value < 2.2e-16
# Since the p-value is very small, we conclude that there is evidence of a non-constant variance for both the tranformations.

## Hence, we have proved from all the three methods above that both the transformed models have constant variance.


# Q. (5) 

# Since we have dependent errors in the transformed data, we use Generalized Least Squares technique.

logg1 <- gls(Y ~ Z1 + Z2, House_Prices)

summary(logg1)
## Generalized least squares fit by REML
##   Model: Y ~ Z1 + Z2 
##   Data: House_Prices 
##        AIC      BIC    logLik
##   13225.83 13242.84 -6608.916
## 
## Coefficients:
##                  Value Std.Error    t-value p-value
## (Intercept) -2861006.5 114216.37 -25.049007   0e+00
## Z1            369395.0  12415.14  29.753592   0e+00
## Z2             30202.3   8805.19   3.430062   7e-04
## 
##  Correlation: 
##    (Intr) Z1    
## Z1 -0.659       
## Z2 -0.582 -0.228
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.66023596 -0.61015373 -0.06016272  0.37787308  5.14958080 
## 
## Residual standard error: 80905.68 
## Degrees of freedom: 522 total; 519 residual
sqrtg2 <- gls(Y ~ Z3 + Z4, House_Prices)

summary(sqrtg2)
## Generalized least squares fit by REML
##   Model: Y ~ Z3 + Z4 
##   Data: House_Prices 
##        AIC     BIC    logLik
##   13214.09 13231.1 -6603.046
## 
## Coefficients:
##                 Value Std.Error    t-value p-value
## (Intercept) -507552.5 25835.945 -19.645209   0e+00
## Z3            15439.2   494.214  31.239946   0e+00
## Z4              391.0   106.411   3.674232   3e-04
## 
##  Correlation: 
##    (Intr) Z3    
## Z3 -0.776       
## Z4 -0.452 -0.196
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.8285434 -0.5440775 -0.0397021  0.3558287  5.0472318 
## 
## Residual standard error: 78835.72 
## Degrees of freedom: 522 total; 519 residual
# Since, the Residual standard errors are very bad for the GLS model, i.e for both the transformed models,
# we can say that the fit done by the generalized least squares technique worsens the tansformed data.