getwd()
## [1] "C:/Users/TANAY/Documents"
house<- read.csv("train_housing.csv")

summary(house)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   None   :864   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   Stone  :128   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   NA's   :  8   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation  BsmtQual   BsmtCond    BsmtExposure
##  Ex: 52    Ex:   3   BrkTil:146   Ex  :121   Fa  :  45   Av  :221    
##  Fa: 14    Fa:  28   CBlock:634   Fa  : 35   Gd  :  65   Gd  :134    
##  Gd:488    Gd: 146   PConc :647   Gd  :618   Po  :   2   Mn  :114    
##  TA:906    Po:   1   Slab  : 24   TA  :649   TA  :1311   No  :953    
##            TA:1282   Stone :  6   NA's: 37   NA's:  37   NA's: 38    
##                      Wood  :  3                                      
##                                                                      
##  BsmtFinType1   BsmtFinSF1     BsmtFinType2   BsmtFinSF2     
##  ALQ :220     Min.   :   0.0   ALQ :  19    Min.   :   0.00  
##  BLQ :148     1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00  
##  GLQ :418     Median : 383.5   GLQ :  14    Median :   0.00  
##  LwQ : 74     Mean   : 443.6   LwQ :  46    Mean   :  46.55  
##  Rec :133     3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00  
##  Unf :430     Max.   :5644.0   Unf :1256    Max.   :1474.00  
##  NA's: 37                      NA's:  38                     
##    BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC CentralAir
##  Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741    N:  95    
##  1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365    
##  Median : 477.5   Median : 991.5   GasW :  18   Gd:241              
##  Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1              
##  3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428              
##  Max.   :2336.0   Max.   :6110.0   Wall :   4                       
##                                                                     
##  Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##  Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                              
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100     
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39     
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586     
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000              
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000              
##                                                              
##   TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu   GarageType 
##  Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6  
##  1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870  
##  Median : 6.000   Min1:  31   Median :1.000   Gd  :380    Basment: 19  
##  Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88  
##  3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9  
##  Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690    Detchd :387  
##                   Typ :1360                               NA's   : 81  
##   GarageYrBlt   GarageFinish   GarageCars      GarageArea     GarageQual 
##  Min.   :1900   Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3  
##  1st Qu.:1961   RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48  
##  Median :1980   Unf :605     Median :2.000   Median : 480.0   Gd  :  14  
##  Mean   :1979   NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3  
##  3rd Qu.:2002                3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311  
##  Max.   :2010                Max.   :4.000   Max.   :1418.0   NA's:  81  
##  NA's   :81                                                              
##  GarageCond  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex  :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa  :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd  :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  Po  :   7              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  TA  :1326              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  NA's:  81              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                           
##    X3SsnPorch      ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex  :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd  :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   NA's:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.00   Max.   :480.00   Max.   :738.000              
##                                                                 
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv:  59   Gar2:   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo :  54   Othr:   2   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv: 157   Shed:  49   Median :    0.00   Median : 6.000  
##  MnWw :  11   TenC:   1   Mean   :   43.49   Mean   : 6.322  
##  NA's :1179   NA's:1406   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :15500.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
select_var <- c('Id','MSZoning','Utilities', 'Neighborhood','BldgType','HouseStyle',
                'OverallQual','OverallCond','YearBuilt', 'ExterQual','ExterCond',
                'BsmtQual','BsmtCond','TotalBsmtSF','Heating','HeatingQC', 
                'CentralAir','Electrical','GrLivArea','BedroomAbvGr','KitchenAbvGr',
                'KitchenQual','TotRmsAbvGrd','Functional','Fireplaces','FireplaceQu',
               'GarageArea','GarageQual','GarageCond','OpenPorchSF','PoolArea',
                'Fence','MoSold','YrSold','SaleType','SaleCondition','SalePrice')

select_train <- house[,select_var]
head(select_train)
##   Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1  1       RL    AllPub      CollgCr     1Fam     2Story           7
## 2  2       RL    AllPub      Veenker     1Fam     1Story           6
## 3  3       RL    AllPub      CollgCr     1Fam     2Story           7
## 4  4       RL    AllPub      Crawfor     1Fam     2Story           7
## 5  5       RL    AllPub      NoRidge     1Fam     2Story           8
## 6  6       RL    AllPub      Mitchel     1Fam     1.5Fin           5
##   OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1           5      2003        Gd        TA       Gd       TA         856
## 2           8      1976        TA        TA       Gd       TA        1262
## 3           5      2001        Gd        TA       Gd       TA         920
## 4           5      1915        TA        TA       TA       Gd         756
## 5           5      2000        Gd        TA       Gd       TA        1145
## 6           5      1993        TA        TA       Gd       TA         796
##   Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1    GasA        Ex          Y      SBrkr      1710            3
## 2    GasA        Ex          Y      SBrkr      1262            3
## 3    GasA        Ex          Y      SBrkr      1786            3
## 4    GasA        Gd          Y      SBrkr      1717            3
## 5    GasA        Ex          Y      SBrkr      2198            4
## 6    GasA        Ex          Y      SBrkr      1362            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1        548         TA         TA          61        0  <NA>      2
## 2        460         TA         TA           0        0  <NA>      5
## 3        608         TA         TA          42        0  <NA>      9
## 4        642         TA         TA          35        0  <NA>      2
## 5        836         TA         TA          84        0  <NA>     12
## 6        480         TA         TA          30        0 MnPrv     10
##   YrSold SaleType SaleCondition SalePrice
## 1   2008       WD        Normal    208500
## 2   2007       WD        Normal    181500
## 3   2008       WD        Normal    223500
## 4   2006       WD       Abnorml    140000
## 5   2008       WD        Normal    250000
## 6   2009       WD        Normal    143000
summary(select_train)
##        Id            MSZoning     Utilities     Neighborhood   BldgType   
##  Min.   :   1.0   C (all):  10   AllPub:1459   NAmes  :225   1Fam  :1220  
##  1st Qu.: 365.8   FV     :  65   NoSeWa:   1   CollgCr:150   2fmCon:  31  
##  Median : 730.5   RH     :  16                 OldTown:113   Duplex:  52  
##  Mean   : 730.5   RL     :1151                 Edwards:100   Twnhs :  43  
##  3rd Qu.:1095.2   RM     : 218                 Somerst: 86   TwnhsE: 114  
##  Max.   :1460.0                                Gilbert: 79                
##                                                (Other):707                
##    HouseStyle   OverallQual      OverallCond      YearBuilt    ExterQual
##  1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872   Ex: 52   
##  2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   Fa: 14   
##  1.5Fin :154   Median : 6.000   Median :5.000   Median :1973   Gd:488   
##  SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971   TA:906   
##  SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000            
##  1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010            
##  (Other): 19                                                            
##  ExterCond BsmtQual   BsmtCond     TotalBsmtSF      Heating     HeatingQC
##  Ex:   3   Ex  :121   Fa  :  45   Min.   :   0.0   Floor:   1   Ex:741   
##  Fa:  28   Fa  : 35   Gd  :  65   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Gd: 146   Gd  :618   Po  :   2   Median : 991.5   GasW :  18   Gd:241   
##  Po:   1   TA  :649   TA  :1311   Mean   :1057.4   Grav :   7   Po:  1   
##  TA:1282   NA's: 37   NA's:  37   3rd Qu.:1298.2   OthW :   2   TA:428   
##                                   Max.   :6110.0   Wall :   4            
##                                                                          
##  CentralAir Electrical     GrLivArea     BedroomAbvGr    KitchenAbvGr  
##  N:  95     FuseA:  94   Min.   : 334   Min.   :0.000   Min.   :0.000  
##  Y:1365     FuseF:  27   1st Qu.:1130   1st Qu.:2.000   1st Qu.:1.000  
##             FuseP:   3   Median :1464   Median :3.000   Median :1.000  
##             Mix  :   1   Mean   :1515   Mean   :2.866   Mean   :1.047  
##             SBrkr:1334   3rd Qu.:1777   3rd Qu.:3.000   3rd Qu.:1.000  
##             NA's :   1   Max.   :5642   Max.   :8.000   Max.   :3.000  
##                                                                        
##  KitchenQual  TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu
##  Ex:100      Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24   
##  Fa: 39      1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33   
##  Gd:586      Median : 6.000   Min1:  31   Median :1.000   Gd  :380   
##  TA:735      Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20   
##              3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313   
##              Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690   
##                               Typ :1360                              
##    GarageArea     GarageQual  GarageCond   OpenPorchSF    
##  Min.   :   0.0   Ex  :   3   Ex  :   2   Min.   :  0.00  
##  1st Qu.: 334.5   Fa  :  48   Fa  :  35   1st Qu.:  0.00  
##  Median : 480.0   Gd  :  14   Gd  :   9   Median : 25.00  
##  Mean   : 473.0   Po  :   3   Po  :   7   Mean   : 46.66  
##  3rd Qu.: 576.0   TA  :1311   TA  :1326   3rd Qu.: 68.00  
##  Max.   :1418.0   NA's:  81   NA's:  81   Max.   :547.00  
##                                                           
##     PoolArea         Fence          MoSold           YrSold    
##  Min.   :  0.000   GdPrv:  59   Min.   : 1.000   Min.   :2006  
##  1st Qu.:  0.000   GdWo :  54   1st Qu.: 5.000   1st Qu.:2007  
##  Median :  0.000   MnPrv: 157   Median : 6.000   Median :2008  
##  Mean   :  2.759   MnWw :  11   Mean   : 6.322   Mean   :2008  
##  3rd Qu.:  0.000   NA's :1179   3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :738.000                Max.   :12.000   Max.   :2010  
##                                                                
##     SaleType    SaleCondition    SalePrice     
##  WD     :1267   Abnorml: 101   Min.   : 34900  
##  New    : 122   AdjLand:   4   1st Qu.:129975  
##  COD    :  43   Alloca :  12   Median :163000  
##  ConLD  :   9   Family :  20   Mean   :180921  
##  ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  ConLw  :   5   Partial: 125   Max.   :755000  
##  (Other):   9
summary(select_train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
summary(select_train$MSZoning)
## C (all)      FV      RH      RL      RM 
##      10      65      16    1151     218
summary(select_train$SaleCondition)
## Abnorml AdjLand  Alloca  Family  Normal Partial 
##     101       4      12      20    1198     125
summary(select_train$Utilities)
## AllPub NoSeWa 
##   1459      1
summary(select_train$HouseStyle)
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer   SLvl 
##    154     14    726      8     11    445     37     65
library(plyr)
ddply(select_train, .(BldgType), summarize,Total = length(BldgType),Max_price=max(SalePrice),Min_price=min(SalePrice))
##   BldgType Total Max_price Min_price
## 1     1Fam  1220    755000     34900
## 2   2fmCon    31    228950     55000
## 3   Duplex    52    206300     82000
## 4    Twnhs    43    230000     75000
## 5   TwnhsE   114    392500     75500
select_train$HouseStyle2 <- as.numeric(factor(select_train$HouseStyle, 
                                  levels = c("1.5Fin", "1.5Unf","1Story", "2.5Fin","2.5Unf","2Story","SFoyer","SLvl"),
                                  labels = c(8,7,6,5,2,4,3,1) ,ordered = TRUE))

PLOTS

library(lattice)
library(ggplot2)
histogram(select_train$SalePrice, Main="Range of Sale Price", xlab="Sale price", ylab="Count of houses")

boxplot(select_train$SalePrice, Main="Range of Sale Price", xlab="Sale price", horizontal = TRUE)

ggplot(select_train, aes(x = SalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 5000) +
  ggtitle("Figure 1 Histogram of SalePrice") +
  ylab("Count of houses") +
  xlab("Housing Price") + 
  theme(plot.title = element_text(hjust = 0.5))

ggplot(select_train, aes(x=MSZoning, y=SalePrice, fill=MSZoning)) + 
  geom_boxplot(alpha=0.3) +
  stat_summary(fun.y=mean, geom="point", shape=20, size=4, color="red", fill="red")+
  theme(legend.position="none")+
  ggtitle("Figure 4 Boxplot of SalePrice by MSZoning")+
  theme(plot.title = element_text(hjust = 0.5))

ggplot(select_train, aes(SalePrice)) +
 geom_histogram(aes(fill = BldgType), position = position_stack(reverse = TRUE), binwidth = 20000) +
 coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
 ylab("Count") +
 xlab("Housing Price") + 
 theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",
                                                                                                           size=0.5, linetype="solid", 
                                                                                                           colour ="black"))

library("car")
scatterplot(select_train$SalePrice~select_train$OverallQual, main="Sale price Vs Overall quality", ylab="Sale price", xlab="Overall quality")

ggplot(select_train, aes(x=TotalBsmtSF, y=SalePrice)) + 
  geom_point(shape=1) +  
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Figure 9 Scatter plot of SalePrice and TotalBsmtSF") +
  theme(plot.title = element_text(hjust = 0.4))

CORRELATIONS

cor.test(select_train$SalePrice,select_train$YrSold)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$YrSold
## t = -1.1048, df = 1458, p-value = 0.2694
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08010603  0.02241298
## sample estimates:
##         cor 
## -0.02892259
cor.test(select_train$SalePrice,select_train$OverallQual)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$OverallQual
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7709644 0.8094376
## sample estimates:
##       cor 
## 0.7909816
cor.test(select_train$SalePrice,select_train$OverallCond)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12864437 -0.02666008
## sample estimates:
##         cor 
## -0.07785589
cor.test(select_train$SalePrice,select_train$GrLivArea)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$GrLivArea
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6821200 0.7332695
## sample estimates:
##       cor 
## 0.7086245
cor.test(select_train$SalePrice,select_train$TotalBsmtSF)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5805529 0.6445923
## sample estimates:
##       cor 
## 0.6135806
cor.test(select_train$SalePrice,select_train$BedroomAbvGr)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$BedroomAbvGr
## t = 6.5159, df = 1458, p-value = 9.927e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1179285 0.2176373
## sample estimates:
##       cor 
## 0.1682132
cor.test(select_train$SalePrice,select_train$TotRmsAbvGrd)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4960020 0.5694337
## sample estimates:
##       cor 
## 0.5337232
cor.test(select_train$SalePrice,select_train$GarageArea)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5910324 0.6538222
## sample estimates:
##       cor 
## 0.6234314
cor.test(select_train$SalePrice,select_train$OpenPorchSF)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$OpenPorchSF
## t = 12.711, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2689114 0.3613039
## sample estimates:
##       cor 
## 0.3158562
cor.test(select_train$SalePrice,select_train$PoolArea)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$PoolArea
## t = 3.5435, df = 1458, p-value = 0.0004073
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04129701 0.14302783
## sample estimates:
##        cor 
## 0.09240355
cor.test(select_train$SalePrice,select_train$YearBuilt)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4845947 0.5591987
## sample estimates:
##       cor 
## 0.5228973
cor.test(select_train$SalePrice,select_train$HouseStyle2)
## 
##  Pearson's product-moment correlation
## 
## data:  select_train$SalePrice and select_train$HouseStyle2
## t = 6.9937, df = 1458, p-value = 4.064e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1300625 0.2293451
## sample estimates:
##       cor 
## 0.1801626

MODEL

select_train$ExterCond2 <- as.numeric(factor(select_train$ExterCond, 
                                  levels = c("Ex", "Fa","Gd", "TA","Po"),
                                  labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$HeatingQC2 <- as.numeric(factor(select_train$HeatingQC, 
                                  levels = c("Ex", "Fa","Gd", "TA","Po"),
                                  labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$CentralAir2 <- as.numeric(factor(select_train$CentralAir, 
                                  levels = c("N", "Y"),
                                  labels = c(0,1) ,ordered = TRUE))
model_var <- c('SalePrice', 
                'OverallQual','OverallCond','YearBuilt','ExterCond2',
                'TotalBsmtSF','HeatingQC2', 
                'CentralAir2','GrLivArea','BedroomAbvGr','KitchenAbvGr',
                'TotRmsAbvGrd','Fireplaces',
                'GarageArea','OpenPorchSF','PoolArea',
                 'YrSold')
heat <- select_train[,model_var]

library("corrplot")
## corrplot 0.84 loaded
corrplot(corr=cor(heat ,use="complete.obs"), method="ellipse")

library("corrgram")
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:plyr':
## 
##     baseball
corrgram(heat,upper.panel=panel.pie, main="Corrgram of real estate Data variables")

#Using linear regression model

model1 <- lm(SalePrice~.-SalePrice ,data=heat)
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ . - SalePrice, data = heat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515179  -18402   -2568   14445  290927 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.033e+05  1.462e+06  -0.344  0.73069    
## OverallQual   1.573e+04  1.184e+03  13.287  < 2e-16 ***
## OverallCond   6.680e+03  1.017e+03   6.569 7.06e-11 ***
## YearBuilt     4.733e+02  5.055e+01   9.362  < 2e-16 ***
## ExterCond2   -1.143e+03  2.412e+03  -0.474  0.63564    
## TotalBsmtSF   2.780e+01  2.815e+00   9.878  < 2e-16 ***
## HeatingQC2   -2.364e+03  8.374e+02  -2.823  0.00482 ** 
## CentralAir2  -9.472e+03  4.574e+03  -2.071  0.03855 *  
## GrLivArea     4.975e+01  4.077e+00  12.203  < 2e-16 ***
## BedroomAbvGr -9.825e+03  1.721e+03  -5.710 1.37e-08 ***
## KitchenAbvGr -2.254e+04  4.978e+03  -4.528 6.45e-06 ***
## TotRmsAbvGrd  5.407e+03  1.282e+03   4.218 2.62e-05 ***
## Fireplaces    8.621e+03  1.767e+03   4.880 1.18e-06 ***
## GarageArea    4.292e+01  5.960e+00   7.201 9.61e-13 ***
## OpenPorchSF  -1.514e+01  1.576e+01  -0.961  0.33688    
## PoolArea     -3.215e+01  2.460e+01  -1.307  0.19140    
## YrSold       -2.326e+02  7.277e+02  -0.320  0.74933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36690 on 1443 degrees of freedom
## Multiple R-squared:  0.7891, Adjusted R-squared:  0.7868 
## F-statistic: 337.4 on 16 and 1443 DF,  p-value: < 2.2e-16
model_lin <- select_train[, model_var]
model_lin$lSalePrice <- log(model_lin$SalePrice)

set.seed(10000)
train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1]*0.8)
model_lin_train = model_lin[train.index,]
model_lin_valid <- model_lin[-train.index,]

model2<- lm(SalePrice~.-SalePrice ,data=model_lin_train)
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ . - SalePrice, data = model_lin_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -97776 -11306  -4266   5851 257436 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.169e+06  1.007e+06  -3.146 0.001696 ** 
## OverallQual   1.787e+03  8.864e+02   2.016 0.044082 *  
## OverallCond  -2.592e+03  7.486e+02  -3.463 0.000554 ***
## YearBuilt    -1.009e+02  3.849e+01  -2.620 0.008898 ** 
## ExterCond2   -4.763e+03  1.706e+03  -2.793 0.005313 ** 
## TotalBsmtSF   9.338e+00  2.221e+00   4.205 2.82e-05 ***
## HeatingQC2    5.327e+02  5.864e+02   0.908 0.363812    
## CentralAir2  -2.121e+04  3.262e+03  -6.503 1.17e-10 ***
## GrLivArea     1.984e+01  3.027e+00   6.555 8.40e-11 ***
## BedroomAbvGr -9.137e+03  1.205e+03  -7.585 6.85e-14 ***
## KitchenAbvGr -1.117e+04  3.603e+03  -3.101 0.001975 ** 
## TotRmsAbvGrd  1.075e+03  8.807e+02   1.221 0.222339    
## Fireplaces   -2.811e+03  1.276e+03  -2.203 0.027822 *  
## GarageArea    1.761e+00  4.329e+00   0.407 0.684175    
## OpenPorchSF  -8.383e+00  1.064e+01  -0.788 0.430846    
## PoolArea      5.335e+01  1.579e+01   3.378 0.000754 ***
## YrSold        7.727e+02  5.017e+02   1.540 0.123790    
## lSalePrice    1.705e+05  4.794e+03  35.576  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22580 on 1150 degrees of freedom
## Multiple R-squared:  0.9179, Adjusted R-squared:  0.9167 
## F-statistic: 756.7 on 17 and 1150 DF,  p-value: < 2.2e-16
linreg <- lm(lSalePrice~.-SalePrice, data = model_lin_train)
summary(linreg)
## 
## Call:
## lm(formula = lSalePrice ~ . - SalePrice, data = model_lin_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.58114 -0.06702  0.00342  0.07786  0.44064 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.602e+00  6.190e+00   0.905  0.36565    
## OverallQual   6.725e-02  5.077e-03  13.246  < 2e-16 ***
## OverallCond   5.923e-02  4.259e-03  13.907  < 2e-16 ***
## YearBuilt     3.345e-03  2.152e-04  15.543  < 2e-16 ***
## ExterCond2    1.736e-02  1.047e-02   1.658  0.09769 .  
## TotalBsmtSF   1.782e-04  1.261e-05  14.133  < 2e-16 ***
## HeatingQC2   -1.633e-02  3.573e-03  -4.570 5.41e-06 ***
## CentralAir2   5.788e-02  1.998e-02   2.896  0.00385 ** 
## GrLivArea     2.319e-04  1.732e-05  13.392  < 2e-16 ***
## BedroomAbvGr -1.188e-02  7.399e-03  -1.606  0.10866    
## KitchenAbvGr -8.760e-02  2.200e-02  -3.981 7.29e-05 ***
## TotRmsAbvGrd  1.646e-02  5.394e-03   3.052  0.00233 ** 
## Fireplaces    6.310e-02  7.625e-03   8.276 3.49e-16 ***
## GarageArea    2.488e-04  2.559e-05   9.723  < 2e-16 ***
## OpenPorchSF  -1.497e-07  6.541e-05  -0.002  0.99817    
## PoolArea      1.315e-04  9.702e-05   1.355  0.17571    
## YrSold       -8.613e-04  3.085e-03  -0.279  0.78013    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1389 on 1151 degrees of freedom
## Multiple R-squared:  0.8787, Adjusted R-squared:  0.877 
## F-statistic:   521 on 16 and 1151 DF,  p-value: < 2.2e-16
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
#use predict() to make prediction on a new set
pred1 <- predict(linreg,model_lin_valid,type = "response")
residuals <- model_lin_valid$lSalePrice - pred1
linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_valid$lSalePrice, "Residual" = residuals)
accuracy(pred1, model_lin_valid$lSalePrice)
##                   ME      RMSE       MAE        MPE      MAPE
## Test set -0.01358129 0.2256403 0.1155791 -0.1262696 0.9675185
pred1train <- predict(linreg,model_lin_train,type = "response")
residualstrain <- model_lin_train$lSalePrice - pred1train
linreg_predtrain <- data.frame("Predicted" = pred1train, "Actual" = model_lin_train$lSalePrice, "Residual" = residualstrain)
accuracy(pred1train, model_lin_train$lSalePrice)
##                     ME      RMSE        MAE        MPE      MAPE
## Test set -6.963971e-15 0.1378493 0.09716002 -0.0134659 0.8122805
#To get back actual Sales price values
pred1SP <- exp(pred1)
residualsSP <- model_lin_valid$SalePrice - pred1SP
linreg_predSP <- data.frame("Predicted" = pred1SP, "Actual" = model_lin_valid$SalePrice, "Residual" = residualsSP)
accuracy(pred1SP, model_lin_valid$SalePrice)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -7889.941 150563.4 27609.39 -7.064565 16.38413
pred2<- predict(model2,model_lin_valid,type = "response")
residuals2 <- model_lin_valid$SalePrice - pred2
model2_pred <- data.frame("Predicted" = pred2, "Actual" = model_lin_valid$SalePrice, "Residual" = residuals2)
accuracy(pred2, model_lin_valid$SalePrice)
##                ME    RMSE      MAE      MPE     MAPE
## Test set 753.8356 24739.1 14932.91 1.386573 9.730968
pred2train<- predict(model2,model_lin_train,type = "response")
residuals2train <- model_lin_train$SalePrice - pred2train
model2_predtrain <- data.frame("Predicted" = pred2train, "Actual" = model_lin_train$SalePrice, "Residual" = residuals2train)
accuracy(pred2train, model_lin_train$SalePrice)
##                    ME     RMSE     MAE      MPE     MAPE
## Test set 6.346432e-09 22409.07 13554.7 1.526914 9.291755
library(gbm)
## Warning: package 'gbm' was built under R version 3.4.4
## Loading required package: survival
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
gbm1 <- gbm(lSalePrice~.-SalePrice, data = model_lin_train, distribution = "gaussian", n.trees = 10000, interaction.depth = 4, shrinkage = 0.01)
summary(gbm1)

##                       var    rel.inf
## OverallQual   OverallQual 32.3510863
## GrLivArea       GrLivArea 21.2835300
## TotalBsmtSF   TotalBsmtSF 12.7418111
## YearBuilt       YearBuilt  9.9735082
## GarageArea     GarageArea  8.2870102
## OverallCond   OverallCond  4.1620693
## OpenPorchSF   OpenPorchSF  2.8860044
## Fireplaces     Fireplaces  2.3703428
## CentralAir2   CentralAir2  1.8781091
## TotRmsAbvGrd TotRmsAbvGrd  0.8918616
## ExterCond2     ExterCond2  0.7514918
## HeatingQC2     HeatingQC2  0.7201534
## YrSold             YrSold  0.6993501
## BedroomAbvGr BedroomAbvGr  0.5546517
## KitchenAbvGr KitchenAbvGr  0.4490200
## PoolArea         PoolArea  0.0000000
#n.trees = seq(from=100 ,to=10000, by=100)
predgbm <- predict(gbm1,model_lin_valid,type = "response", n.trees=100)
residualsgbm <- model_lin_valid$lSalePrice - predgbm
gbm_pred <- data.frame("Predicted" = predgbm, "Actual" = model_lin_valid$lSalePrice, "Residual" = residualsgbm)
accuracy(predgbm, model_lin_valid$lSalePrice)
##                   ME      RMSE       MAE        MPE     MAPE
## Test set -0.01658722 0.2518658 0.1850822 -0.2050172 1.543412
#RANDOM FOREST

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
RF <- randomForest(lSalePrice ~.-SalePrice, data = model_lin_train, 
                   importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)

rf.pred <- predict(RF, newdata=model_lin_valid )
residualsrf <- model_lin_valid$lSalePrice - rf.pred
rf_pred <- data.frame("Predicted" = rf.pred, "Actual" = model_lin_valid$lSalePrice, "Residual" = residualsrf)
accuracy(rf.pred, model_lin_valid$lSalePrice)
##                   ME      RMSE       MAE        MPE      MAPE
## Test set -0.01170986 0.1633046 0.1061606 -0.1182802 0.8878536
# RF using sales price directly

rfmodel <- randomForest(SalePrice ~.-SalePrice, data = model_lin_train, 
                   importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)
rfpredsp <- predict(rfmodel, newdata=model_lin_valid )
residualsrfsp <- model_lin_valid$SalePrice - rfpredsp
rf_pred <- data.frame("Predicted" = rfpredsp, "Actual" = model_lin_valid$SalePrice, "Residual" = residualsrfsp)
accuracy(rfpredsp, model_lin_valid$SalePrice)
##                 ME    RMSE      MAE       MPE     MAPE
## Test set -1029.869 13463.5 6288.608 -1.786636 3.685443