#Introduction: For the final exam we need to import the training data from https://www.kaggle.com/c/house-prices-advanced-regression-techniques and register on Kaggle. I started by loading the libraries, then imported the data as shown below:
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tinytex)
## Warning: package 'tinytex' was built under R version 4.3.3
library(ggplot2)
# Read the Kaggle training set into a data frame and preview its first rows.
datahouse <- read.csv(file = "C:/Users/Chafiaa/Downloads/train.csv")
head(x = datahouse)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
# Per-column summary: five-number summary, mean and NA count for numeric
# columns; length, class and mode for character columns.
summary(object = datahouse)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
#Find an X variable that is skewed to the right:
# Inspect the histograms of a few candidate predictors for right skew.
hist(datahouse$GarageArea)
hist(datahouse$LotFrontage)
hist(datahouse$WoodDeckSF) # selected as the X variable
# X is the predictor and Y (the sale price) the dependent variable used in the
# rest of the analysis.
X <- datahouse[["WoodDeckSF"]]
Y <- datahouse[["SalePrice"]]
# Quartiles of X, followed by the quartiles of Y.
quantile(X)
## 0% 25% 50% 75% 100%
## 0 0 0 168 857
quantile(Y)
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
#probability:
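a. P(X > x | Y > y), where x = 168 is the 3rd quartile of X and y = 163000 is the median of Y. Let A = {X > 168} and B = {Y > 163000}. By the definition of the quantiles P(A) ≈ .25 and P(B) ≈ .50, but P(A and B) has to be counted from the data (it equals P(A) * P(B) only if A and B are independent): P(A|B) = P(A and B) / P(B) = (248/1460) / (728/1460) = 248/728 ≈ .34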
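#b. P(X > x, Y > y), where x = 168 is the 3rd quartile of X and y = 163000 is the median of Y. This is the joint probability that both events occur, not the sum P(A) + P(B). With A = {X > 168} and B = {Y > 163000}: P(A, B) = P(A and B) = 248/1460 ≈ .17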
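#c. P(X < x | Y > y), where x = 168 is the 3rd quartile of X and y = 163000 is the median of Y.
Let A = {X <= 168} (the counts below use X <= 168) and B = {Y > 163000}, so P(A) ≈ .75 and P(B) ≈ .50. P(A|B) = P(A and B) / P(B) = (480/1460) / (728/1460) = 480/728 ≈ .66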
#Splitting the data doesn’t change the relationship.
# Count the observations in each cell of the 2 x 2 split of X and Y.
# The cut points are the third quartile of X and the median of Y; they are
# computed from the data instead of being typed in by hand, so the counts stay
# correct if the data or the chosen variables change. For this data set they
# evaluate to 168 and 163000 (see the quantile() output above).
x_q3 <- unname(quantile(X, probs = 0.75))
y_med <- unname(quantile(Y, probs = 0.50))
# X at or below its 3rd quartile, Y at or below its median.
count(subset(datahouse, X <= x_q3 & Y <= y_med))
## n
## 1 624
# X at or below its 3rd quartile, Y above its median.
count(subset(datahouse, X <= x_q3 & Y > y_med))
## n
## 1 480
# X above its 3rd quartile, Y at or below its median.
count(subset(datahouse, X > x_q3 & Y <= y_med))
## n
## 1 108
# X above its 3rd quartile, Y above its median.
count(subset(datahouse, X > x_q3 & Y > y_med))
## n
## 1 248
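#Does P(A|B) = P(A), i.e. does P(A and B) = P(A) * P(B)? With A = {X > 168} and B = {Y > 163000}: P(A) = 356/1460 ≈ .24 and P(B) = 728/1460 ≈ .50, so under independence P(A and B) would be P(A) * P(B) ≈ .24 * .50 = .12 and P(A|B) would equal P(A) ≈ .24.
The observed values are P(A and B) = 248/1460 ≈ .17 and P(A|B) = 248/728 ≈ .34, so A and B are not independent; the chi-squared test below confirms this.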
# Chi-squared test of independence on the 2 x 2 table of counts obtained above.
# matrix() fills column by column, so column 1 holds the X <= 168 counts
# (Y <= 163000, then Y > 163000) and column 2 holds the X > 168 counts.
chisq.test(matrix(c(624,480,108,248), ncol=2))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: matrix(c(624, 480, 108, 248), ncol = 2)
## X-squared = 72.785, df = 1, p-value < 2.2e-16
#Descriptive and Inferential Statistics:
# Keep two categorical columns (Functional, LotShape) and two numeric columns
# (LotFrontage, LotArea) for the descriptive statistics.
datahouse1 <- datahouse[, c("Functional", "LotShape", "LotFrontage", "LotArea"),
                        drop = FALSE]
glimpse(datahouse1)
## Rows: 1,460
## Columns: 4
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "M…
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", "R…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, NA…
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120,…
head(x = datahouse1)
## Functional LotShape LotFrontage LotArea
## 1 Typ Reg 65 8450
## 2 Typ Reg 80 9600
## 3 Typ IR1 68 11250
## 4 Typ IR1 60 9550
## 5 Typ IR1 84 14260
## 6 Typ IR1 85 14115
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of the four selected columns, drawn with the
# black-and-white theme.
ggpairs(data = datahouse1) + theme_bw()
## Warning: Removed 259 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 259 rows containing non-finite values (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 259 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 259 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 259 rows containing missing values (`geom_point()`).
# Scatter plot of X (wood deck area) against Y (sale price).
ggplot(data = datahouse, mapping = aes(x = X, y = Y)) +
  labs(
    title = "Scatter Plot of wood deck and sale price",
    x = "wood deck",
    y = "sale price"
  ) +
  geom_point()
# Two-sample t-test on the means of X and Y with a 95% confidence interval for
# the difference in means; the result is stored and then printed.
t_test <- t.test(X, Y, conf.level = 0.95)
print(t_test)
##
## Welch Two Sample t-test
##
## data: X and Y
## t = -86.973, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -184905.3 -176748.6
## sample estimates:
## mean of x mean of y
## 94.24452 180921.19589
# Pearson correlation matrix of wood deck area and sale price.
c <- cor(datahouse[c("WoodDeckSF", "SalePrice")])
c
## WoodDeckSF SalePrice
## WoodDeckSF 1.0000000 0.3244134
## SalePrice 0.3244134 1.0000000
# Test the null hypothesis of zero correlation and report a 99% confidence
# interval for the correlation.
cor.test(datahouse$WoodDeckSF, datahouse$SalePrice, conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: datahouse$WoodDeckSF and datahouse$SalePrice
## t = 13.096, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.2627778 0.3834121
## sample estimates:
## cor
## 0.3244134
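#We can be 99% confident that the population correlation between WoodDeckSF and SalePrice lies within the interval [0.2627778, 0.3834121]; put differently, if the sampling were repeated many times, about 99 out of 100 intervals built this way would contain the true correlation. The interval excludes 0 and the p-value is negligible, so we reject the null hypothesis that there is no correlation. The estimated correlation (about 0.32) is positive but only weak to moderate.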
#Linear Algebra:
# Invert the correlation matrix `c` with solve() to obtain the precision matrix.
p <- solve(c) #precision matrix
p
## WoodDeckSF SalePrice
## WoodDeckSF 1.117623 -0.362572
## SalePrice -0.362572 1.117623
# Multiplying the matrix by its inverse gives the 2 x 2 identity matrix in
# either order, as the printed results show.
p %*% c #precision * correlation
## WoodDeckSF SalePrice
## WoodDeckSF 1 0
## SalePrice 0 1
c %*% p #correlation *precision
## WoodDeckSF SalePrice
## WoodDeckSF 1 0
## SalePrice 0 1
# Principal component analysis of WoodDeckSF and SalePrice.
# princomp() has to be given the observations themselves. Passing the 2 x 2
# correlation matrix `c` as `x` makes princomp() treat its two rows as two
# observations, and two points always fall on a single line, so the first
# component trivially "explains" 100% of the variance.
# cor = TRUE bases the decomposition on the correlation matrix of the two
# columns, which puts both variables on the same scale.
# NOTE(review): the recorded output that follows this call was produced by the
# earlier princomp(c, cor = TRUE) and has to be regenerated after this change.
pca <- princomp(datahouse[, c("WoodDeckSF", "SalePrice")], cor = TRUE)
pca
## Call:
## princomp(x = c, cor = TRUE)
##
## Standard deviations:
## Comp.1 Comp.2
## 1.414214 0.000000
##
## 2 variables and 2 observations.
# Component scores of the PCA and the correlation between the components.
PC <- pca[["scores"]]
cor_PC <- cor(PC)
## Warning in cor(PC): the standard deviation is zero
cor_PC
## Comp.1 Comp.2
## Comp.1 1 NA
## Comp.2 NA 1
# Standard deviation and share of variance carried by each component.
summary_pca <- summary(pca)
summary_pca
## Importance of components:
## Comp.1 Comp.2
## Standard deviation 1.414214 0
## Proportion of Variance 1.000000 0
## Cumulative Proportion 1.000000 1
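#Discussion: the first component shows 100% of the variance, which at first looks like an extreme case of linear dependence between my two variables. That is surprising, and it is in fact an artifact: princomp() was given the 2 x 2 correlation matrix as if it were data (note "2 variables and 2 observations" in the output), and two points always lie on a single line. WoodDeckSF and SalePrice are only weakly to moderately correlated (r ≈ 0.32); a PCA on the two standardized data columns would attribute about (1 + 0.32)/2 ≈ 66% of the variance to the first component and about 34% to the second.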
#Calculus-Based Probability & Statistics:
library(MASS)
## Warning: package 'MASS' was built under R version 4.3.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Shift X up by one so that every value is strictly positive before fitting.
X_shifted <- 1 + X
# Fit an exponential distribution to the shifted values with MASS::fitdistr().
exp_fit <- fitdistr(X_shifted, densfun = "exponential")
lambda <- exp_fit[["estimate"]] # fitted rate parameter λ
# Draw 1000 values from the fitted exponential distribution.
samples <- rexp(n = 1000, rate = lambda)
# Show the original variable and the simulated sample side by side.
par(mfrow = c(1, 2))
hist(X, col = "purple", xlab = "wood deck", main = "wood deck histogram")
hist(samples, col = "green", xlab = "Sample Values",
     main = "Ft Exponential Distribution")
# Empirical 5th and 95th percentiles of X.
data_per <- quantile(X, probs = c(0.05, 0.95))
data_per
## 5% 95%
## 0 335
# 95% confidence interval for the mean of X, assuming normality:
# mean +/- z * (standard deviation / sqrt(n)).
mean_X <- mean(X)
std_X <- sd(X)
n <- length(X)
z <- qnorm(1 - 0.05 / 2) # two-sided 95% critical value
ci_half_width <- z * (std_X / sqrt(n))
lower_ci <- mean_X - ci_half_width
upper_ci <- mean_X + ci_half_width
print(paste("95% Confidence Interval is: ", lower_ci, ", ", upper_ci))
## [1] "95% Confidence Interval is: 87.8153169957939 , 100.673724100096"
#modeling
library(tidyverse)
library(MASS)
library(dplyr)
# Multiple linear regression of SalePrice on fourteen numeric predictors.
model <- lm(
  formula = SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond +
    YearBuilt + MasVnrArea + X1stFlrSF + X2ndFlrSF + BsmtFullBath +
    BedroomAbvGr + GarageCars + WoodDeckSF + ScreenPorch + PoolArea,
  data = datahouse
)
summary(model)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + MasVnrArea + X1stFlrSF + X2ndFlrSF +
## BsmtFullBath + BedroomAbvGr + GarageCars + WoodDeckSF + ScreenPorch +
## PoolArea, data = datahouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -452876 -17230 -1448 13892 292879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.548e+05 8.793e+04 -9.721 < 2e-16 ***
## MSSubClass -1.907e+02 2.419e+01 -7.883 6.26e-15 ***
## LotArea 4.536e-01 1.001e-01 4.532 6.33e-06 ***
## OverallQual 1.958e+04 1.097e+03 17.860 < 2e-16 ***
## OverallCond 5.376e+03 9.265e+02 5.803 8.02e-09 ***
## YearBuilt 3.961e+02 4.495e+01 8.810 < 2e-16 ***
## MasVnrArea 3.242e+01 5.859e+00 5.533 3.73e-08 ***
## X1stFlrSF 7.086e+01 3.723e+00 19.033 < 2e-16 ***
## X2ndFlrSF 6.091e+01 3.329e+00 18.295 < 2e-16 ***
## BsmtFullBath 1.370e+04 1.924e+03 7.124 1.65e-12 ***
## BedroomAbvGr -7.986e+03 1.446e+03 -5.522 3.97e-08 ***
## GarageCars 1.048e+04 1.704e+03 6.147 1.02e-09 ***
## WoodDeckSF 2.697e+01 7.964e+00 3.386 0.000727 ***
## ScreenPorch 5.617e+01 1.685e+01 3.334 0.000878 ***
## PoolArea -2.957e+01 2.348e+01 -1.260 0.208031
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35200 on 1437 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8048, Adjusted R-squared: 0.8029
## F-statistic: 423.2 on 14 and 1437 DF, p-value: < 2.2e-16
# Diagnostic plots for `model`, arranged in a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model)
# Second model: regress log(SalePrice) on eleven numeric predictors.
model1 <- lm(
  formula = log(SalePrice) ~ MSSubClass + LotArea + OverallQual +
    OverallCond + YearBuilt + X1stFlrSF + GrLivArea + BsmtFullBath +
    Fireplaces + GarageCars + ScreenPorch,
  data = datahouse
)
summary(model1)
##
## Call:
## lm(formula = log(SalePrice) ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + X1stFlrSF + GrLivArea + BsmtFullBath +
## Fireplaces + GarageCars + ScreenPorch, data = datahouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.03693 -0.06833 0.00320 0.08037 0.49918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.288e+00 3.718e-01 8.844 < 2e-16 ***
## MSSubClass -7.331e-04 1.022e-04 -7.171 1.18e-12 ***
## LotArea 2.035e-06 4.306e-07 4.728 2.49e-06 ***
## OverallQual 9.463e-02 4.637e-03 20.407 < 2e-16 ***
## OverallCond 5.655e-02 3.938e-03 14.362 < 2e-16 ***
## YearBuilt 3.671e-03 1.907e-04 19.249 < 2e-16 ***
## X1stFlrSF 5.062e-05 1.454e-05 3.481 0.000514 ***
## GrLivArea 2.430e-04 1.141e-05 21.287 < 2e-16 ***
## BsmtFullBath 7.341e-02 8.116e-03 9.045 < 2e-16 ***
## Fireplaces 4.503e-02 7.313e-03 6.157 9.57e-10 ***
## GarageCars 7.773e-02 7.233e-03 10.747 < 2e-16 ***
## ScreenPorch 2.960e-04 7.234e-05 4.092 4.51e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1506 on 1448 degrees of freedom
## Multiple R-squared: 0.8589, Adjusted R-squared: 0.8579
## F-statistic: 801.5 on 11 and 1448 DF, p-value: < 2.2e-16
#both models look good , high R^2 and QQ plot most values on the line.
# Diagnostic plots for the log-price model `model1`, in a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(x = model1)
library(flextable)
## Warning: package 'flextable' was built under R version 4.3.3
##
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
##
## compose
# Read the Kaggle test set.
housetest <- read.csv(file = "C:/Users/Chafiaa/Downloads/test.csv")
# model1 is fitted on log(SalePrice), so its predictions are exponentiated to
# get prices; they are then paired with the test Ids in a two-column data frame.
pred <- set_names(
  as.data.frame(
    cbind(housetest$Id, exp(predict(model1, newdata = housetest)))
  ),
  c("Id", "SalePrice")
)
# Show the first predictions as a formatted table.
flextable(head(pred))
Id | SalePrice |
|---|---|
1,461 | 118,686.3 |
1,462 | 142,179.6 |
1,463 | 163,074.8 |
1,464 | 187,737.6 |
1,465 | 186,498.0 |
1,466 | 174,919.8 |
# Write the submission file with one predicted SalePrice per test Id.
# A missing prediction used to be written as a SalePrice of 0. That is not a
# plausible house price and has no finite logarithm, although the model itself
# is fitted on log(SalePrice). Missing predictions are filled with the median
# of the available predictions instead.
submission <- pred
na_price <- is.na(submission$SalePrice)
submission$SalePrice[na_price] <- median(submission$SalePrice, na.rm = TRUE)
write.csv(submission, "Housing_MarketValue.csv", row.names = FALSE)