# Install ggplot2 only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/jenny_000/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\jenny_000\AppData\Local\Temp\RtmpOW1lV4\downloaded_packages
# Install corrplot only when it is not already available. The stray empty
# positional argument of the original call (`"corrplot", , repos = ...`) is
# removed; `repos` is passed by name so no placeholder is needed.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/jenny_000/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\jenny_000\AppData\Local\Temp\RtmpOW1lV4\downloaded_packages
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
# Read the training data from the course repository and preview the first rows.
df_training <- read.csv(
  file = "https://raw.githubusercontent.com/JennierJ/CUNY_DATA_605/master/Final_Exam/train.csv"
)
head(x = df_training)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
# Per-column summary of the training data (quartiles, level counts, NA counts).
summary(object = df_training)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000
## PosA : 1 1.5Unf : 14 Max. :10.000
## (Other): 2 (Other): 19
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 None :864 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 Stone :128 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 NA's : 8 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45 Av :221
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65 Gd :134
## Gd:488 Gd: 146 PConc :647 Gd :618 Po : 2 Mn :114
## TA:906 Po: 1 Slab : 24 TA :649 TA :1311 No :953
## TA:1282 Stone : 6 NA's: 37 NA's: 37 NA's: 38
## Wood : 3
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## ALQ :220 Min. : 0.0 ALQ : 19 Min. : 0.00
## BLQ :148 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00
## GLQ :418 Median : 383.5 GLQ : 14 Median : 0.00
## LwQ : 74 Mean : 443.6 LwQ : 46 Mean : 46.55
## Rec :133 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00
## Unf :430 Max. :5644.0 Unf :1256 Max. :1474.00
## NA's: 37 NA's: 38
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741 N: 95
## 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365
## Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :2336.0 Max. :6110.0 Wall : 4
##
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39
## Median :0.0000 Median :3.000 Median :1.000 Gd:586
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1: 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690 Detchd :387
## Typ :1360 NA's : 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. :1900 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1961 RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1980 Unf :605 Median :2.000 Median : 480.0 Gd : 14
## Mean :1979 NA's: 81 Mean :1.767 Mean : 473.0 Po : 3
## 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311
## Max. :2010 Max. :4.000 Max. :1418.0 NA's: 81
## NA's :81
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## Po : 7 Mean : 94.24 Mean : 46.66 Mean : 21.95
## TA :1326 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## NA's: 81 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 NA's:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 59 Gar2: 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 Othr: 2 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv: 157 Shed: 49 Median : 0.00 Median : 6.000
## MnWw : 11 TenC: 1 Mean : 43.49 Mean : 6.322
## NA's :1179 NA's:1406 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Read the test data; it is used for the predictions at the end of the script.
df_test <- read.csv(
  file = "https://raw.githubusercontent.com/JennierJ/CUNY_DATA_605/master/Final_Exam/test.csv"
)
# Keep only the two columns examined below, LotArea and SalePrice.
df1 <- df_training[, c("LotArea", "SalePrice")]
head(x = df1)
## LotArea SalePrice
## 1 8450 208500
## 2 9600 181500
## 3 11250 223500
## 4 9550 140000
## 5 14260 250000
## 6 14115 143000
# Histograms of LotArea and SalePrice with fixed bin widths.
ggplot(data = df1, mapping = aes(x = LotArea)) +
  geom_histogram(binwidth = 550, colour = "purple")
ggplot(data = df1, mapping = aes(x = SalePrice)) +
  geom_histogram(binwidth = 1500, colour = "purple")
#### Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
# Summary of both columns; the "1st Qu." values printed here are the
# thresholds (7554 and 129975) used in the probability calculations below.
summary(object = df1)
## LotArea SalePrice
## Min. : 1300 Min. : 34900
## 1st Qu.: 7554 1st Qu.:129975
## Median : 9478 Median :163000
## Mean : 10517 Mean :180921
## 3rd Qu.: 11602 3rd Qu.:214000
## Max. :215245 Max. :755000
# P(LotArea > 7554 | SalePrice > 129975), reported as a percentage:
# rows satisfying both conditions divided by rows satisfying the SalePrice one.
total <- nrow(df1)
data_p1 <- subset(df1, LotArea > 7554 & SalePrice > 129975)
p1 <- nrow(data_p1)
data_p2 <- subset(df1, SalePrice > 129975)
p2 <- nrow(data_p2)
prop_1 <- 100 * (p1 / p2)
prop_1
## [1] 82.00913
# P(LotArea > 7554 and SalePrice > 129975), reported as a percentage:
# rows satisfying both conditions divided by the total number of rows.
data_p1 <- subset(df1, LotArea > 7554 & SalePrice > 129975)
p1 <- nrow(data_p1)
prop_2 <- 100 * (p1 / total)
prop_2
## [1] 61.50685
# The probability (as a percentage) that the LotArea is less than 7554 given
# that the SalePrice is greater than 129975. This is a conditional probability:
# the count of rows meeting both conditions is divided by p2, the number of
# rows with SalePrice > 129975 computed earlier, not by the total row count.
data_p3 <- subset(df1, df1$LotArea < 7554 & df1$SalePrice > 129975)
p3 <- nrow(data_p3)
prop_3 <- (p3/p2) *100
prop_3
## [1] 17.99087
# Independence check: P(A), P(B) and P(AB), each reported as a percentage.
# A is the set of rows with LotArea > 7554.
A <- subset(df1, LotArea > 7554)
PA <- 100 * (nrow(A) / total)
PA
## [1] 75
# P(A) = 0.75
# B is the set of rows with SalePrice > 129975.
B <- subset(df1, SalePrice > 129975)
PB <- 100 * (nrow(B) / total)
PB
## [1] 75
# P(B) = 0.75
# AB is the set of rows satisfying both conditions at once.
AB <- subset(df1, LotArea > 7554 & SalePrice > 129975)
PAB <- 100 * (nrow(AB) / total)
PAB
## [1] 61.50685
# P(AB) = 0.61
\[ P(A)P(B) \neq P(AB) \] ##### Since P(A)P(B) = 0.75 × 0.75 = 0.5625 while P(AB) ≈ 0.615, events A and B are not independent, and splitting the data in this way does not make them independent.
# Chi-squared test of independence between the events A (LotArea > 7554) and
# B (SalePrice > 129975).
# chisq.test() interprets a matrix or data frame argument as a contingency
# table of counts, so passing df1 directly treated the raw LotArea/SalePrice
# values of all 1460 rows as counts (hence df = 1459 in the old output).
# The test is run on the 2 x 2 table of event counts instead.
# NOTE: the printed output that follows in this document was produced by the
# previous call on df1 and has to be regenerated.
event_counts <- table(
  A = df1$LotArea > 7554,
  B = df1$SalePrice > 129975
)
event_counts
chisq.test(event_counts)
##
## Pearson's Chi-squared test
##
## data: df1
## X-squared = 7125100, df = 1459, p-value < 2.2e-16
# Five-number summary (plus mean) of LotArea on its own.
summary(object = df1$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10520 11600 215200
# Scatter plot of SalePrice against LotArea.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the `data` argument. Writing `df1$LotArea` inside aes() bypasses that lookup,
# is flagged as discouraged by ggplot2, and breaks if layers or facets use a
# different data frame.
ggplot(df1, aes(x = LotArea, y = SalePrice)) +
  geom_point(size = 2, shape = 18, color = "blue") +
  labs(title = "SalePrice vs. LotArea", x = "LotArea", y = "SalePrice")
# Histograms of three numeric columns; no bin width is given, so ggplot2 falls
# back to its default number of bins and prints the message shown after each.
ggplot(data = df_training, mapping = aes(x = YearBuilt)) +
  geom_histogram(colour = "darkblue", fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = df_training, mapping = aes(x = GrLivArea)) +
  geom_histogram(colour = "darkblue", fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = df_training, mapping = aes(x = SalePrice)) +
  geom_histogram(colour = "darkblue", fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bar charts of three categorical columns.
ggplot(data = df_training, mapping = aes(x = HouseStyle)) +
  geom_bar(colour = "darkblue", fill = "lightblue")
ggplot(data = df_training, mapping = aes(x = BldgType)) +
  geom_bar(colour = "darkblue", fill = "lightblue")
ggplot(data = df_training, mapping = aes(x = MSZoning)) +
  geom_bar(colour = "darkblue", fill = "lightblue")
#### Descriptive and Inferential Statistics. Derive a correlation matrix for any THREE quantitative variables in the dataset. I am analyzing the correlations between LotArea, GrLivArea and SalePrice.
# List every column name of the training data.
names(df_training)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Kendall rank-correlation matrix of the three chosen variables, drawn as a
# circle plot.
# NOTE(review): the cor.test() calls further down report Pearson correlations,
# so their estimates are not the values in this matrix -- confirm that mixing
# the two methods is intended.
df_matrix <- df_training[, c("LotArea", "GrLivArea", "SalePrice")]
cor_matrix <- cor(x = df_matrix, method = "kendall", use = "complete.obs")
corrplot(corr = cor_matrix, method = "circle")
##### Correlation Test
# Test whether the LotArea/SalePrice correlation is zero, with a 92% interval.
cor.test(
  x = df_training$LotArea,
  y = df_training$SalePrice,
  conf.level = 0.92
)
##
## Pearson's product-moment correlation
##
## data: df_training$LotArea and df_training$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2206794 0.3059759
## sample estimates:
## cor
## 0.2638434
# Test whether the GrLivArea/SalePrice correlation is zero, with a 92% interval.
cor.test(
  x = df_training$GrLivArea,
  y = df_training$SalePrice,
  conf.level = 0.92
)
##
## Pearson's product-moment correlation
##
## data: df_training$GrLivArea and df_training$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.6850407 0.7307245
## sample estimates:
## cor
## 0.7086245
# Precision matrix = inverse of the correlation matrix. Multiplying it back by
# the correlation matrix and rounding should print the identity matrix.
pre_matrix <- solve(a = cor_matrix)
round(x = pre_matrix %*% cor_matrix, digits = 0)
## LotArea GrLivArea SalePrice
## LotArea 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
# Multiple linear regression of SalePrice on nine predictors from the training
# data. BldgType and BsmtQual are categorical, so they appear in the summary as
# one coefficient per non-reference level.
# Rows with a missing value in any model variable are dropped by lm(); the
# summary printed below reports how many observations were deleted.
lr <- lm(SalePrice ~ LotArea + LotFrontage + BldgType +
OverallQual + BsmtQual + GarageArea + GarageYrBlt +
GrLivArea + TotalBsmtSF, data = df_training)
# Coefficient table, residual quantiles, R-squared and overall F-test.
summary(lr)
##
## Call:
## lm(formula = SalePrice ~ LotArea + LotFrontage + BldgType + OverallQual +
## BsmtQual + GarageArea + GarageYrBlt + GrLivArea + TotalBsmtSF,
## data = df_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -488971 -17498 -706 15316 261798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.962e+05 1.460e+05 -2.028 0.042807 *
## LotArea 7.263e-01 1.618e-01 4.488 7.94e-06 ***
## LotFrontage -1.650e+02 6.425e+01 -2.569 0.010338 *
## BldgType2fmCon -1.904e+04 8.886e+03 -2.142 0.032395 *
## BldgTypeDuplex -2.797e+04 8.208e+03 -3.407 0.000680 ***
## BldgTypeTwnhs -3.152e+04 7.306e+03 -4.313 1.75e-05 ***
## BldgTypeTwnhsE -1.708e+04 4.922e+03 -3.471 0.000539 ***
## OverallQual 2.095e+04 1.463e+03 14.321 < 2e-16 ***
## BsmtQualFa -6.976e+04 9.359e+03 -7.453 1.86e-13 ***
## BsmtQualGd -5.056e+04 4.601e+03 -10.988 < 2e-16 ***
## BsmtQualTA -6.244e+04 5.921e+03 -10.545 < 2e-16 ***
## GarageArea 3.940e+01 9.245e+00 4.262 2.20e-05 ***
## GarageYrBlt 1.510e+02 7.417e+01 2.036 0.042034 *
## GrLivArea 4.606e+01 3.134e+00 14.696 < 2e-16 ***
## TotalBsmtSF 1.997e+01 3.831e+00 5.214 2.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38490 on 1088 degrees of freedom
## (357 observations deleted due to missingness)
## Multiple R-squared: 0.7893, Adjusted R-squared: 0.7866
## F-statistic: 291.1 on 14 and 1088 DF, p-value: < 2.2e-16
# Backward elimination step: refit the same model with GarageYrBlt removed,
# then inspect the refitted coefficients.
lr1 <- update(object = lr, formula. = . ~ . - GarageYrBlt)
summary(object = lr1)
##
## Call:
## lm(formula = SalePrice ~ LotArea + LotFrontage + BldgType + OverallQual +
## BsmtQual + GarageArea + GrLivArea + TotalBsmtSF, data = df_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -486529 -16814 -782 15762 264867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.242e+04 1.061e+04 1.171 0.241819
## LotArea 7.101e-01 1.587e-01 4.475 8.39e-06 ***
## LotFrontage -1.676e+02 6.158e+01 -2.722 0.006577 **
## BldgType2fmCon -1.787e+04 7.715e+03 -2.316 0.020731 *
## BldgTypeDuplex -2.079e+04 6.845e+03 -3.037 0.002441 **
## BldgTypeTwnhs -2.740e+04 6.779e+03 -4.042 5.66e-05 ***
## BldgTypeTwnhsE -1.685e+04 4.791e+03 -3.516 0.000455 ***
## OverallQual 2.021e+04 1.364e+03 14.815 < 2e-16 ***
## BsmtQualFa -7.946e+04 8.559e+03 -9.284 < 2e-16 ***
## BsmtQualGd -5.292e+04 4.494e+03 -11.776 < 2e-16 ***
## BsmtQualTA -7.070e+04 5.382e+03 -13.135 < 2e-16 ***
## GarageArea 4.269e+01 6.763e+00 6.312 3.91e-10 ***
## GrLivArea 4.410e+01 2.906e+00 15.175 < 2e-16 ***
## TotalBsmtSF 2.124e+01 3.705e+00 5.733 1.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38070 on 1156 degrees of freedom
## (290 observations deleted due to missingness)
## Multiple R-squared: 0.7941, Adjusted R-squared: 0.7918
## F-statistic: 343 on 13 and 1156 DF, p-value: < 2.2e-16
# Residual diagnostics for lr1: residuals against fitted values, followed by a
# normal Q-Q plot of the residuals.
plot(x = fitted(lr1), y = resid(lr1))
qqnorm(y = resid(lr1))
# Predict SalePrice for the test data with the reduced model and wrap the
# predictions in a one-column data frame for inspection.
predicted_data <- predict(object = lr1, newdata = df_test)
submission <- data.frame(predicted_data = predicted_data)
summary(object = submission)
## predicted_data
## Min. : -2620
## 1st Qu.:127314
## Median :162729
## Mean :178581
## 3rd Qu.:214093
## Max. :603124
## NA's :265
# Replace missing predictions with the median of the available predictions.
# The median is computed from the data instead of being copied by hand from
# the printed summary, so it stays correct if the model or the data change.
missing_pred <- is.na(submission$predicted_data)
submission$predicted_data[missing_pred] <-
  median(submission$predicted_data, na.rm = TRUE)

# Write the file as two named columns, Id and SalePrice, without R's row-name
# column. The original call wrote a single `predicted_data` column plus an
# unnamed row-number column, which carries no house identifier.
# NOTE(review): assumes test.csv has the same `Id` column as train.csv --
# confirm before submitting.
kaggle_submission <- data.frame(
  Id = df_test$Id,
  SalePrice = submission$predicted_data
)
write.csv(kaggle_submission, "KaggleSubmission.csv", row.names = FALSE)